lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #define DEBUG_TYPE "x86-isel"
  16 #include "X86.h"
  17 #include "X86InstrBuilder.h"
  18 #include "X86ISelLowering.h"
  19 #include "X86MCTargetExpr.h"
  20 #include "X86TargetMachine.h"
  21 #include "X86TargetObjectFile.h"
  22 #include "llvm/CallingConv.h"
  23 #include "llvm/Constants.h"
  24 #include "llvm/DerivedTypes.h"
  25 #include "llvm/GlobalAlias.h"
  26 #include "llvm/GlobalVariable.h"
  27 #include "llvm/Function.h"
  28 #include "llvm/Instructions.h"
  29 #include "llvm/Intrinsics.h"
  30 #include "llvm/LLVMContext.h"
  31 #include "llvm/CodeGen/MachineFrameInfo.h"
  32 #include "llvm/CodeGen/MachineFunction.h"
  33 #include "llvm/CodeGen/MachineInstrBuilder.h"
  34 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  35 #include "llvm/CodeGen/MachineModuleInfo.h"
  36 #include "llvm/CodeGen/MachineRegisterInfo.h"
  37 #include "llvm/CodeGen/PseudoSourceValue.h"
  38 #include "llvm/MC/MCAsmInfo.h"
  39 #include "llvm/MC/MCContext.h"
  40 #include "llvm/MC/MCSymbol.h"
  41 #include "llvm/ADT/BitVector.h"
  42 #include "llvm/ADT/SmallSet.h"
  43 #include "llvm/ADT/Statistic.h"
  44 #include "llvm/ADT/StringExtras.h"
  45 #include "llvm/ADT/VectorExtras.h"
  46 #include "llvm/Support/CommandLine.h"
  47 #include "llvm/Support/Debug.h"
  48 #include "llvm/Support/ErrorHandling.h"
  49 #include "llvm/Support/MathExtras.h"
  50 #include "llvm/Support/raw_ostream.h"
  51 using namespace llvm;
  52
  53 STATISTIC(NumTailCalls, "Number of tail calls");
  54
  55 static cl::opt<bool>
  56 DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
  57
  58 // Disable16Bit - 16-bit operations typically have a larger encoding than
  59 // corresponding 32-bit instructions, and 16-bit code is slow on some
  60 // processors. This is an experimental flag to disable 16-bit operations
  61 // (which forces them to be Legalized to 32-bit operations).
  62 static cl::opt<bool>
  63 Disable16Bit("disable-16bit", cl::Hidden,
  64              cl::desc("Disable use of 16-bit instructions"));
  65
  66 // Forward declarations.
  67 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
  68                        SDValue V2);
  69
  70 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  71   switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  72   default: llvm_unreachable("unknown subtarget type");
  73   case X86Subtarget::isDarwin:
  74     if (TM.getSubtarget<X86Subtarget>().is64Bit())
  75       return new X8664_MachoTargetObjectFile();
  76     return new X8632_MachoTargetObjectFile();
  77   case X86Subtarget::isELF:
  78    if (TM.getSubtarget<X86Subtarget>().is64Bit())
  79      return new X8664_ELFTargetObjectFile(TM);
  80     return new X8632_ELFTargetObjectFile(TM);
  81   case X86Subtarget::isMingw:
  82   case X86Subtarget::isCygwin:
  83   case X86Subtarget::isWindows:
  84     return new TargetLoweringObjectFileCOFF();
  85   }
  86 }
  87
  88 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  89   : TargetLowering(TM, createTLOF(TM)) {
  90   Subtarget = &TM.getSubtarget<X86Subtarget>();
  91   X86ScalarSSEf64 = Subtarget->hasSSE2();
  92   X86ScalarSSEf32 = Subtarget->hasSSE1();
  93   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
  94
  95   RegInfo = TM.getRegisterInfo();
  96   TD = getTargetData();
  97
  98   // Set up the TargetLowering object.
  99
 100   // X86 is weird, it always uses i8 for shift amounts and setcc results.
 101   setShiftAmountType(MVT::i8);
 102   setBooleanContents(ZeroOrOneBooleanContent);
 103   setSchedulingPreference(SchedulingForRegPressure);
 104   setStackPointerRegisterToSaveRestore(X86StackPtr);
 105
 106   if (Subtarget->isTargetDarwin()) {
 107     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 108     setUseUnderscoreSetJmp(false);
 109     setUseUnderscoreLongJmp(false);
 110   } else if (Subtarget->isTargetMingw()) {
 111     // MS runtime is weird: it exports _setjmp, but longjmp!
 112     setUseUnderscoreSetJmp(true);
 113     setUseUnderscoreLongJmp(false);
 114   } else {
 115     setUseUnderscoreSetJmp(true);
 116     setUseUnderscoreLongJmp(true);
 117   }
 118
 119   // Set up the register classes.
 120   addRegisterClass(MVT::i8, X86::GR8RegisterClass);
 121   if (!Disable16Bit)
 122     addRegisterClass(MVT::i16, X86::GR16RegisterClass);
 123   addRegisterClass(MVT::i32, X86::GR32RegisterClass);
 124   if (Subtarget->is64Bit())
 125     addRegisterClass(MVT::i64, X86::GR64RegisterClass);
 126
 127   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 128
 129   // We don't accept any truncstore of integer registers.
 130   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 131   if (!Disable16Bit)
 132     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 133   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 134   if (!Disable16Bit)
 135     setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 136   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 137   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 138
 139   // SETOEQ and SETUNE require checking two conditions.
 140   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 141   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 142   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 143   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 144   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 145   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 146
 147   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 148   // operation.
 149   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 150   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 151   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 152
 153   if (Subtarget->is64Bit()) {
 154     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 155     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
 156   } else if (!UseSoftFloat) {
 157     if (X86ScalarSSEf64) {
 158       // We have an impenetrably clever algorithm for ui64->double only.
 159       setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
 160     }
 161     // We have an algorithm for SSE2, and we turn this into a 64-bit
 162     // FILD for other targets.
 163     setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
 164   }
 165
 166   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 167   // this operation.
 168   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 169   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 170
 171   if (!UseSoftFloat) {
 172     // SSE has no i16 to fp conversion, only i32
 173     if (X86ScalarSSEf32) {
 174       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 175       // f32 and f64 cases are Legal, f80 case is not
 176       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 177     } else {
 178       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 179       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 180     }
 181   } else {
 182     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 183     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 184   }
 185
 186   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 187   // are Legal, f80 is custom lowered.
 188   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 189   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 190
 191   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
 192   // this operation.
 193   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 194   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 195
 196   if (X86ScalarSSEf32) {
 197     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 198     // f32 and f64 cases are Legal, f80 case is not
 199     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 200   } else {
 201     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 202     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 203   }
 204
 205   // Handle FP_TO_UINT by promoting the destination to a larger signed
 206   // conversion.
 207   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 208   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 209   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 210
 211   if (Subtarget->is64Bit()) {
 212     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 213     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 214   } else if (!UseSoftFloat) {
 215     if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
 216       // Expand FP_TO_UINT into a select.
 217       // FIXME: We would like to use a Custom expander here eventually to do
 218       // the optimal thing for SSE vs. the default expansion in the legalizer.
 219       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 220     else
 221       // With SSE3 we can use fisttpll to convert to a signed i64; without
 222       // SSE, we're stuck with a fistpll.
 223       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 224   }
 225
 226   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 227   if (!X86ScalarSSEf64) {
 228     setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
 229     setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
 230   }
 231
 232   // Scalar integer divide and remainder are lowered to use operations that
 233   // produce two results, to match the available instructions. This exposes
 234   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 235   // into a single instruction.
 236   //
 237   // Scalar integer multiply-high is also lowered to use two-result
 238   // operations, to match the available instructions. However, plain multiply
 239   // (low) operations are left as Legal, as there are single-result
 240   // instructions for this in x86. Using the two-result multiply instructions
 241   // when both high and low results are needed must be arranged by dagcombine.
 242   setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
 243   setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
 244   setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
 245   setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
 246   setOperationAction(ISD::SREM            , MVT::i8    , Expand);
 247   setOperationAction(ISD::UREM            , MVT::i8    , Expand);
 248   setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
 249   setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
 250   setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
 251   setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
 252   setOperationAction(ISD::SREM            , MVT::i16   , Expand);
 253   setOperationAction(ISD::UREM            , MVT::i16   , Expand);
 254   setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
 255   setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
 256   setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
 257   setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
 258   setOperationAction(ISD::SREM            , MVT::i32   , Expand);
 259   setOperationAction(ISD::UREM            , MVT::i32   , Expand);
 260   setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
 261   setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
 262   setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
 263   setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
 264   setOperationAction(ISD::SREM            , MVT::i64   , Expand);
 265   setOperationAction(ISD::UREM            , MVT::i64   , Expand);
 266
 267   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 268   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 269   setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
 270   setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
 271   if (Subtarget->is64Bit())
 272     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 273   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 274   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 275   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 276   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 277   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 278   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 279   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 280   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 281
 282   setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
 283   setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
 284   setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
 285   setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
 286   if (Disable16Bit) {
 287     setOperationAction(ISD::CTTZ           , MVT::i16  , Expand);
 288     setOperationAction(ISD::CTLZ           , MVT::i16  , Expand);
 289   } else {
 290     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 291     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 292   }
 293   setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
 294   setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
 295   setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
 296   if (Subtarget->is64Bit()) {
 297     setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
 298     setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
 299     setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
 300   }
 301
 302   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 303   setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
 304
 305   // These should be promoted to a larger select which is supported.
 306   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 307   // X86 wants to expand cmov itself.
 308   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 309   if (Disable16Bit)
 310     setOperationAction(ISD::SELECT        , MVT::i16  , Expand);
 311   else
 312     setOperationAction(ISD::SELECT        , MVT::i16  , Custom);
 313   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 314   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 315   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 316   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 317   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 318   if (Disable16Bit)
 319     setOperationAction(ISD::SETCC         , MVT::i16  , Expand);
 320   else
 321     setOperationAction(ISD::SETCC         , MVT::i16  , Custom);
 322   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 323   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 324   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 325   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 326   if (Subtarget->is64Bit()) {
 327     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 328     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 329   }
 330   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 331
 332   // Darwin ABI issue.
 333   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 334   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 335   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 336   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 337   if (Subtarget->is64Bit())
 338     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 339   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 340   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 341   if (Subtarget->is64Bit()) {
 342     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 343     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 344     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 345     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 346     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 347   }
 348   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
 349   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 350   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 351   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 352   if (Subtarget->is64Bit()) {
 353     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 354     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 355     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 356   }
 357
 358   if (Subtarget->hasSSE1())
 359     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 360
 361   if (!Subtarget->hasSSE2())
 362     setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
 363
 364   // Expand certain atomics
 365   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
 366   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
 367   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
 368   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
 369
 370   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
 371   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
 372   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
 373   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
 374
 375   if (!Subtarget->is64Bit()) {
 376     setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
 377     setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
 378     setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
 379     setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
 380     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
 381     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
 382     setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
 383   }
 384
 385   // FIXME - use subtarget debug flags
 386   if (!Subtarget->isTargetDarwin() &&
 387       !Subtarget->isTargetELF() &&
 388       !Subtarget->isTargetCygMing()) {
 389     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 390   }
 391
 392   setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
 393   setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
 394   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
 395   setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
 396   if (Subtarget->is64Bit()) {
 397     setExceptionPointerRegister(X86::RAX);
 398     setExceptionSelectorRegister(X86::RDX);
 399   } else {
 400     setExceptionPointerRegister(X86::EAX);
 401     setExceptionSelectorRegister(X86::EDX);
 402   }
 403   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 404   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 405
 406   setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
 407
 408   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 409
 410   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 411   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 412   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 413   if (Subtarget->is64Bit()) {
 414     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 415     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 416   } else {
 417     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 418     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 419   }
 420
 421   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 422   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 423   if (Subtarget->is64Bit())
 424     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
 425   if (Subtarget->isTargetCygMing())
 426     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 427   else
 428     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 429
 430   if (!UseSoftFloat && X86ScalarSSEf64) {
 431     // f32 and f64 use SSE.
 432     // Set up the FP register classes.
 433     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
 434     addRegisterClass(MVT::f64, X86::FR64RegisterClass);
 435
 436     // Use ANDPD to simulate FABS.
 437     setOperationAction(ISD::FABS , MVT::f64, Custom);
 438     setOperationAction(ISD::FABS , MVT::f32, Custom);
 439
 440     // Use XORP to simulate FNEG.
 441     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 442     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 443
 444     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 445     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 446     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 447
 448     // We don't support sin/cos/fmod
 449     setOperationAction(ISD::FSIN , MVT::f64, Expand);
 450     setOperationAction(ISD::FCOS , MVT::f64, Expand);
 451     setOperationAction(ISD::FSIN , MVT::f32, Expand);
 452     setOperationAction(ISD::FCOS , MVT::f32, Expand);
 453
 454     // Expand FP immediates into loads from the stack, except for the special
 455     // cases we handle.
 456     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 457     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 458   } else if (!UseSoftFloat && X86ScalarSSEf32) {
 459     // Use SSE for f32, x87 for f64.
 460     // Set up the FP register classes.
 461     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
 462     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
 463
 464     // Use ANDPS to simulate FABS.
 465     setOperationAction(ISD::FABS , MVT::f32, Custom);
 466
 467     // Use XORP to simulate FNEG.
 468     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 469
 470     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 471
 472     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 473     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 474     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 475
 476     // We don't support sin/cos/fmod
 477     setOperationAction(ISD::FSIN , MVT::f32, Expand);
 478     setOperationAction(ISD::FCOS , MVT::f32, Expand);
 479
 480     // Special cases we handle for FP constants.
 481     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 482     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 483     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 484     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 485     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 486
 487     if (!UnsafeFPMath) {
 488       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
 489       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
 490     }
 491   } else if (!UseSoftFloat) {
 492     // f32 and f64 in x87.
 493     // Set up the FP register classes.
 494     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
 495     addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
 496
 497     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 498     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 499     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 500     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 501
 502     if (!UnsafeFPMath) {
 503       setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
 504       setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
 505     }
 506     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 507     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 508     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 509     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 510     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 511     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 512     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 513     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 514   }
 515
 516   // Long double always uses X87.
 517   if (!UseSoftFloat) {
 518     addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
 519     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 520     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 521     {
 522       bool ignored;
 523       APFloat TmpFlt(+0.0);
 524       TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 525                      &ignored);
 526       addLegalFPImmediate(TmpFlt);  // FLD0
 527       TmpFlt.changeSign();
 528       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 529       APFloat TmpFlt2(+1.0);
 530       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 531                       &ignored);
 532       addLegalFPImmediate(TmpFlt2);  // FLD1
 533       TmpFlt2.changeSign();
 534       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 535     }
 536
 537     if (!UnsafeFPMath) {
 538       setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
 539       setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
 540     }
 541   }
 542
 543   // Always use a library call for pow.
 544   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 545   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 546   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 547
 548   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 549   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 550   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 551   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 552   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 553
 554   // First set operation action for all vector types to either promote
 555   // (for widening) or expand (for scalarization). Then we will selectively
 556   // turn on ones that can be effectively codegen'd.
 557   for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 558        VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
 559     setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
 560     setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
 561     setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
 562     setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
 563     setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
 564     setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
 565     setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
 566     setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
 567     setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
 568     setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
 569     setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
 570     setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
 571     setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
 572     setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
 573     setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
 574     setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
 575     setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
 576     setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
 577     setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
 578     setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
 579     setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
 580     setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
 581     setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
 582     setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
 583     setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
 584     setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
 585     setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
 586     setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
 587     setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
 588     setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
 589     setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
 590     setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
 591     setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
 592     setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
 593     setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
 594     setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
 595     setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
 596     setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
 597     setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
 598     setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
 599     setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
 600     setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
 601     setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
 602     setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
 603     setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
 604     setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
 605     setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
 606     setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
 607     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
 608     setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
 609     setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
 610     setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
 611     setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
 612     for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 613          InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
 614       setTruncStoreAction((MVT::SimpleValueType)VT,
 615                           (MVT::SimpleValueType)InnerVT, Expand);
 616     setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
 617     setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
 618     setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
 619   }
 620
 621   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 622   // with -msoft-float, disable use of MMX as well.
 623   if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
 624     addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
 625     addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
 626     addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
 627     addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
 628     addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
 629
 630     setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
 631     setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
 632     setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
 633     setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
 634
 635     setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
 636     setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
 637     setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
 638     setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
 639
 640     setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
 641     setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
 642
 643     setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
 644     AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
 645     setOperationAction(ISD::AND,                MVT::v4i16, Promote);
 646     AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
 647     setOperationAction(ISD::AND,                MVT::v2i32, Promote);
 648     AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
 649     setOperationAction(ISD::AND,                MVT::v1i64, Legal);
 650
 651     setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
 652     AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
 653     setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
 654     AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
 655     setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
 656     AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
 657     setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
 658
 659     setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
 660     AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
 661     setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
 662     AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
 663     setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
 664     AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
 665     setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
 666
 667     setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
 668     AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
 669     setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
 670     AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
 671     setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
 672     AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
 673     setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
 674     AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
 675     setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
 676
 677     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
 678     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
 679     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
 680     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
 681     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
 682
 683     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
 684     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
 685     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
 686     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
 687
 688     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
 689     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
 690     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
 691     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
 692
 693     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
 694
 695     setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
 696     setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
 697     setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
 698     setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
 699     setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
 700     setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
 701     setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
 702   }
 703
 704   if (!UseSoftFloat && Subtarget->hasSSE1()) {
 705     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
 706
 707     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 708     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 709     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 710     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 711     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 712     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 713     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 714     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 715     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 716     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 717     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 718     setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
 719   }
 720
 721   if (!UseSoftFloat && Subtarget->hasSSE2()) {
 722     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
 723
 724     // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
 725     // registers cannot be used even for integer operations.
 726     addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
 727     addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
 728     addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
 729     addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
 730
 731     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 732     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 733     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 734     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 735     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 736     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 737     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 738     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 739     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 740     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 741     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 742     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 743     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 744     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 745     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 746     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 747
 748     setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
 749     setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
 750     setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
 751     setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
 752
 753     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 754     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 755     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 756     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 757     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 758
 759     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
 760     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
 761     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
 762     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
 763     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
 764
 765     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 766     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
 767       EVT VT = (MVT::SimpleValueType)i;
 768       // Do not attempt to custom lower non-power-of-2 vectors
 769       if (!isPowerOf2_32(VT.getVectorNumElements()))
 770         continue;
 771       // Do not attempt to custom lower non-128-bit vectors
 772       if (!VT.is128BitVector())
 773         continue;
 774       setOperationAction(ISD::BUILD_VECTOR,
 775                          VT.getSimpleVT().SimpleTy, Custom);
 776       setOperationAction(ISD::VECTOR_SHUFFLE,
 777                          VT.getSimpleVT().SimpleTy, Custom);
 778       setOperationAction(ISD::EXTRACT_VECTOR_ELT,
 779                          VT.getSimpleVT().SimpleTy, Custom);
 780     }
 781
 782     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
 783     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
 784     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
 785     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
 786     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
 787     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
 788
 789     if (Subtarget->is64Bit()) {
 790       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
 791       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
 792     }
 793
 794     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
 795     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
 796       MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
 797       EVT VT = SVT;
 798
 799       // Do not attempt to promote non-128-bit vectors
 800       if (!VT.is128BitVector()) {
 801         continue;
 802       }
 803       setOperationAction(ISD::AND,    SVT, Promote);
 804       AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
 805       setOperationAction(ISD::OR,     SVT, Promote);
 806       AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
 807       setOperationAction(ISD::XOR,    SVT, Promote);
 808       AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
 809       setOperationAction(ISD::LOAD,   SVT, Promote);
 810       AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
 811       setOperationAction(ISD::SELECT, SVT, Promote);
 812       AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
 813     }
 814
 815     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 816
 817     // Custom lower v2i64 and v2f64 selects.
 818     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
 819     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
 820     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
 821     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
 822
 823     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
 824     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 825     if (!DisableMMX && Subtarget->hasMMX()) {
 826       setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
 827       setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
 828     }
 829   }
 830
 831   if (Subtarget->hasSSE41()) {
 832     // FIXME: Do we need to handle scalar-to-vector here?
 833     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 834
 835     // i8 and i16 vectors are custom, because the source register and
 836     // source memory operand types are not the same width.  f32 vectors are
 837     // custom since the immediate controlling the insert encodes additional
 838     // information.
 839     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
 840     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 841     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 842     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 843
 844     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
 845     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
 846     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 847     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 848
 849     if (Subtarget->is64Bit()) {
 850       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
 851       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
 852     }
 853   }
 854
 855   if (Subtarget->hasSSE42()) {
 856     setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
 857   }
 858
 859   if (!UseSoftFloat && Subtarget->hasAVX()) {
 860     addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
 861     addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
 862     addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
 863     addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
 864
 865     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
 866     setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
 867     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
 868     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
 869     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
 870     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
 871     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
 872     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
 873     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
 874     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
 875     //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
 876     //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
 877     //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
 878     //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
 879     //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
 880
 881     // Operations to consider commented out -v16i16 v32i8
 882     //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
 883     setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
 884     setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
 885     //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
 886     //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
 887     setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
 888     setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
 889     //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
 890     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
 891     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
 892     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
 893     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
 894     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
 895     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
 896
 897     setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
 898     // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
 899     // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
 900     setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
 901
 902     // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
 903     // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
 904     // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
 905     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
 906     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
 907
 908     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
 909     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
 910     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
 911     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
 912     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
 913     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
 914
 915 #if 0
 916     // Not sure we want to do this since there are no 256-bit integer
 917     // operations in AVX
 918
 919     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
 920     // This includes 256-bit vectors
 921     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
 922       EVT VT = (MVT::SimpleValueType)i;
 923
 924       // Do not attempt to custom lower non-power-of-2 vectors
 925       if (!isPowerOf2_32(VT.getVectorNumElements()))
 926         continue;
 927
 928       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 929       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 930       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 931     }
 932
 933     if (Subtarget->is64Bit()) {
 934       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
 935       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
 936     }
 937 #endif
 938
 939 #if 0
 940     // Not sure we want to do this since there are no 256-bit integer
 941     // operations in AVX
 942
 943     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
 944     // Including 256-bit vectors
 945     for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
 946       EVT VT = (MVT::SimpleValueType)i;
 947
 948       if (!VT.is256BitVector()) {
 949         continue;
 950       }
 951       setOperationAction(ISD::AND,    VT, Promote);
 952       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
 953       setOperationAction(ISD::OR,     VT, Promote);
 954       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
 955       setOperationAction(ISD::XOR,    VT, Promote);
 956       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
 957       setOperationAction(ISD::LOAD,   VT, Promote);
 958       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
 959       setOperationAction(ISD::SELECT, VT, Promote);
 960       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
 961     }
 962
 963     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 964 #endif
 965   }
 966
 967   // We want to custom lower some of our intrinsics.
 968   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 969
 970   // Add/Sub/Mul with overflow operations are custom lowered.
 971   setOperationAction(ISD::SADDO, MVT::i32, Custom);
 972   setOperationAction(ISD::SADDO, MVT::i64, Custom);
 973   setOperationAction(ISD::UADDO, MVT::i32, Custom);
 974   setOperationAction(ISD::UADDO, MVT::i64, Custom);
 975   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
 976   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
 977   setOperationAction(ISD::USUBO, MVT::i32, Custom);
 978   setOperationAction(ISD::USUBO, MVT::i64, Custom);
 979   setOperationAction(ISD::SMULO, MVT::i32, Custom);
 980   setOperationAction(ISD::SMULO, MVT::i64, Custom);
 981
 982   if (!Subtarget->is64Bit()) {
 983     // These libcalls are not available in 32-bit.
 984     setLibcallName(RTLIB::SHL_I128, 0);
 985     setLibcallName(RTLIB::SRL_I128, 0);
 986     setLibcallName(RTLIB::SRA_I128, 0);
 987   }
 988
 989   // We have target-specific dag combine patterns for the following nodes:
 990   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 991   setTargetDAGCombine(ISD::BUILD_VECTOR);
 992   setTargetDAGCombine(ISD::SELECT);
 993   setTargetDAGCombine(ISD::SHL);
 994   setTargetDAGCombine(ISD::SRA);
 995   setTargetDAGCombine(ISD::SRL);
 996   setTargetDAGCombine(ISD::OR);
 997   setTargetDAGCombine(ISD::STORE);
 998   setTargetDAGCombine(ISD::MEMBARRIER);
 999   setTargetDAGCombine(ISD::ZERO_EXTEND);
1000   if (Subtarget->is64Bit())
1001     setTargetDAGCombine(ISD::MUL);
1002
1003   computeRegisterProperties();
1004
1005   // FIXME: These should be based on subtarget info. Plus, the values should
1006   // be smaller when we are in optimizing for size mode.
1007   maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1008   maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
1009   maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
1010   setPrefLoopAlignment(16);
1011   benefitFromCodePlacementOpt = true;
1012 }
1013
1014
1015 MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
1016   return MVT::i8;
1017 }
1018
1019
1020 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1021 /// the desired ByVal argument alignment.
1022 static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
1023   if (MaxAlign == 16)
1024     return;
1025   if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1026     if (VTy->getBitWidth() == 128)
1027       MaxAlign = 16;
1028   } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1029     unsigned EltAlign = 0;
1030     getMaxByValAlign(ATy->getElementType(), EltAlign);
1031     if (EltAlign > MaxAlign)
1032       MaxAlign = EltAlign;
1033   } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
1034     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1035       unsigned EltAlign = 0;
1036       getMaxByValAlign(STy->getElementType(i), EltAlign);
1037       if (EltAlign > MaxAlign)
1038         MaxAlign = EltAlign;
1039       if (MaxAlign == 16)
1040         break;
1041     }
1042   }
1043   return;
1044 }
1045
1046 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1047 /// function arguments in the caller parameter area. For X86, aggregates
1048 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1049 /// are at 4-byte boundaries.
1050 unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
1051   if (Subtarget->is64Bit()) {
1052     // Max of 8 and alignment of type.
1053     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1054     if (TyAlign > 8)
1055       return TyAlign;
1056     return 8;
1057   }
1058
1059   unsigned Align = 4;
1060   if (Subtarget->hasSSE1())
1061     getMaxByValAlign(Ty, Align);
1062   return Align;
1063 }
1064
1065 /// getOptimalMemOpType - Returns the target specific optimal type for load
1066 /// and store operations as a result of memset, memcpy, and memmove
1067 /// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
1068 /// determining it.
1069 EVT
1070 X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
1071                                        bool isSrcConst, bool isSrcStr,
1072                                        SelectionDAG &DAG) const {
1073   // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1074   // linux.  This is because the stack realignment code can't handle certain
1075   // cases like PR2962.  This should be removed when PR2962 is fixed.
1076   const Function *F = DAG.getMachineFunction().getFunction();
1077   bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
1078   if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
1079     if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
1080       return MVT::v4i32;
1081     if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
1082       return MVT::v4f32;
1083   }
1084   if (Subtarget->is64Bit() && Size >= 8)
1085     return MVT::i64;
1086   return MVT::i32;
1087 }
1088
1089 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
1090 /// current function.  The returned value is a member of the
1091 /// MachineJumpTableInfo::JTEntryKind enum.
1092 unsigned X86TargetLowering::getJumpTableEncoding() const {
1093   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1094   // symbol.
1095   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1096       Subtarget->isPICStyleGOT())
1097     return MachineJumpTableInfo::EK_Custom32;
1098
1099   // Otherwise, use the normal jump table encoding heuristics.
1100   return TargetLowering::getJumpTableEncoding();
1101 }
1102
1103 /// getPICBaseSymbol - Return the X86-32 PIC base.
1104 MCSymbol *
1105 X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
1106                                     MCContext &Ctx) const {
1107   const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
1108   return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
1109                                Twine(MF->getFunctionNumber())+"$pb");
1110 }
1111
1112
1113 const MCExpr *
1114 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1115                                              const MachineBasicBlock *MBB,
1116                                              unsigned uid,MCContext &Ctx) const{
1117   assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1118          Subtarget->isPICStyleGOT());
1119   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1120   // entries.
1121   return X86MCTargetExpr::Create(MBB->getSymbol(Ctx),
1122                                  X86MCTargetExpr::GOTOFF, Ctx);
1123 }
1124
1125 /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
1126 /// jumptable.
1127 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1128                                                     SelectionDAG &DAG) const {
1129   if (!Subtarget->is64Bit())
1130     // This doesn't have DebugLoc associated with it, but is not really the
1131     // same as a Register.
1132     return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
1133                        getPointerTy());
1134   return Table;
1135 }
1136
1137 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1138 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1139 /// MCExpr.
1140 const MCExpr *X86TargetLowering::
1141 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1142                              MCContext &Ctx) const {
1143   // X86-64 uses RIP relative addressing based on the jump table label.
1144   if (Subtarget->isPICStyleRIPRel())
1145     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1146
1147   // Otherwise, the reference is relative to the PIC base.
1148   return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
1149 }
1150
1151 /// getFunctionAlignment - Return the Log2 alignment of this function.
1152 unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
1153   return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
1154 }
1155
1156 //===----------------------------------------------------------------------===//
1157 //               Return Value Calling Convention Implementation
1158 //===----------------------------------------------------------------------===//
1159
1160 #include "X86GenCallingConv.inc"
1161
1162 bool
1163 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
1164                         const SmallVectorImpl<EVT> &OutTys,
1165                         const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
1166                         SelectionDAG &DAG) {
1167   SmallVector<CCValAssign, 16> RVLocs;
1168   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1169                  RVLocs, *DAG.getContext());
1170   return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
1171 }
1172
1173 SDValue
1174 X86TargetLowering::LowerReturn(SDValue Chain,
1175                                CallingConv::ID CallConv, bool isVarArg,
1176                                const SmallVectorImpl<ISD::OutputArg> &Outs,
1177                                DebugLoc dl, SelectionDAG &DAG) {
1178
1179   SmallVector<CCValAssign, 16> RVLocs;
1180   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1181                  RVLocs, *DAG.getContext());
1182   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1183
1184   // Add the regs to the liveout set for the function.
1185   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1186   for (unsigned i = 0; i != RVLocs.size(); ++i)
1187     if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1188       MRI.addLiveOut(RVLocs[i].getLocReg());
1189
1190   SDValue Flag;
1191
1192   SmallVector<SDValue, 6> RetOps;
1193   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1194   // Operand #1 = Bytes To Pop
1195   RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));
1196
1197   // Copy the result values into the output registers.
1198   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1199     CCValAssign &VA = RVLocs[i];
1200     assert(VA.isRegLoc() && "Can only return in registers!");
1201     SDValue ValToCopy = Outs[i].Val;
1202
1203     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1204     // the RET instruction and handled by the FP Stackifier.
1205     if (VA.getLocReg() == X86::ST0 ||
1206         VA.getLocReg() == X86::ST1) {
1207       // If this is a copy from an xmm register to ST(0), use an FPExtend to
1208       // change the value to the FP stack register class.
1209       if (isScalarFPTypeInSSEReg(VA.getValVT()))
1210         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1211       RetOps.push_back(ValToCopy);
1212       // Don't emit a copytoreg.
1213       continue;
1214     }
1215
1216     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1217     // which is returned in RAX / RDX.
1218     if (Subtarget->is64Bit()) {
1219       EVT ValVT = ValToCopy.getValueType();
1220       if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1221         ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1222         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1223           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
1224       }
1225     }
1226
1227     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1228     Flag = Chain.getValue(1);
1229   }
1230
1231   // The x86-64 ABI for returning structs by value requires that we copy
1232   // the sret argument into %rax for the return. We saved the argument into
1233   // a virtual register in the entry block, so now we copy the value out
1234   // and into %rax.
1235   if (Subtarget->is64Bit() &&
1236       DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1237     MachineFunction &MF = DAG.getMachineFunction();
1238     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1239     unsigned Reg = FuncInfo->getSRetReturnReg();
1240     if (!Reg) {
1241       Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
1242       FuncInfo->setSRetReturnReg(Reg);
1243     }
1244     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1245
1246     Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1247     Flag = Chain.getValue(1);
1248
1249     // RAX now acts like a return value.
1250     MRI.addLiveOut(X86::RAX);
1251   }
1252
1253   RetOps[0] = Chain;  // Update chain.
1254
1255   // Add the flag if we have it.
1256   if (Flag.getNode())
1257     RetOps.push_back(Flag);
1258
1259   return DAG.getNode(X86ISD::RET_FLAG, dl,
1260                      MVT::Other, &RetOps[0], RetOps.size());
1261 }
1262
1263 /// LowerCallResult - Lower the result values of a call into the
1264 /// appropriate copies out of appropriate physical registers.
1265 ///
1266 SDValue
1267 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1268                                    CallingConv::ID CallConv, bool isVarArg,
1269                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1270                                    DebugLoc dl, SelectionDAG &DAG,
1271                                    SmallVectorImpl<SDValue> &InVals) {
1272
1273   // Assign locations to each value returned by this call.
1274   SmallVector<CCValAssign, 16> RVLocs;
1275   bool Is64Bit = Subtarget->is64Bit();
1276   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1277                  RVLocs, *DAG.getContext());
1278   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1279
1280   // Copy all of the result registers out of their specified physreg.
1281   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1282     CCValAssign &VA = RVLocs[i];
1283     EVT CopyVT = VA.getValVT();
1284
1285     // If this is x86-64, and we disabled SSE, we can't return FP values
1286     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1287         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1288       llvm_report_error("SSE register return with SSE disabled");
1289     }
1290
1291     // If this is a call to a function that returns an fp value on the floating
1292     // point stack, but where we prefer to use the value in xmm registers, copy
1293     // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
1294     if ((VA.getLocReg() == X86::ST0 ||
1295          VA.getLocReg() == X86::ST1) &&
1296         isScalarFPTypeInSSEReg(VA.getValVT())) {
1297       CopyVT = MVT::f80;
1298     }
1299
1300     SDValue Val;
1301     if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1302       // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1303       if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1304         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1305                                    MVT::v2i64, InFlag).getValue(1);
1306         Val = Chain.getValue(0);
1307         Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1308                           Val, DAG.getConstant(0, MVT::i64));
1309       } else {
1310         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1311                                    MVT::i64, InFlag).getValue(1);
1312         Val = Chain.getValue(0);
1313       }
1314       Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1315     } else {
1316       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1317                                  CopyVT, InFlag).getValue(1);
1318       Val = Chain.getValue(0);
1319     }
1320     InFlag = Chain.getValue(2);
1321
1322     if (CopyVT != VA.getValVT()) {
1323       // Round the F80 the right size, which also moves to the appropriate xmm
1324       // register.
1325       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1326                         // This truncation won't change the value.
1327                         DAG.getIntPtrConstant(1));
1328     }
1329
1330     InVals.push_back(Val);
1331   }
1332
1333   return Chain;
1334 }
1335
1336
1337 //===----------------------------------------------------------------------===//
1338 //                C & StdCall & Fast Calling Convention implementation
1339 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack. Symbols are also decorated
//  in a special way. It doesn't support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation LowerX86_32FastCCCallTo.
1346
1347 /// CallIsStructReturn - Determines whether a call uses struct return
1348 /// semantics.
1349 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1350   if (Outs.empty())
1351     return false;
1352
1353   return Outs[0].Flags.isSRet();
1354 }
1355
1356 /// ArgsAreStructReturn - Determines whether a function uses struct
1357 /// return semantics.
1358 static bool
1359 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1360   if (Ins.empty())
1361     return false;
1362
1363   return Ins[0].Flags.isSRet();
1364 }
1365
1366 /// IsCalleePop - Determines whether the callee is required to pop its
1367 /// own arguments. Callee pop is necessary to support tail calls.
1368 bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
1369   if (IsVarArg)
1370     return false;
1371
1372   switch (CallingConv) {
1373   default:
1374     return false;
1375   case CallingConv::X86_StdCall:
1376     return !Subtarget->is64Bit();
1377   case CallingConv::X86_FastCall:
1378     return !Subtarget->is64Bit();
1379   case CallingConv::Fast:
1380     return GuaranteedTailCallOpt;
1381   }
1382 }
1383
1384 /// CCAssignFnForNode - Selects the correct CCAssignFn for a the
1385 /// given CallingConvention value.
1386 CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1387   if (Subtarget->is64Bit()) {
1388     if (Subtarget->isTargetWin64())
1389       return CC_X86_Win64_C;
1390     else
1391       return CC_X86_64_C;
1392   }
1393
1394   if (CC == CallingConv::X86_FastCall)
1395     return CC_X86_32_FastCall;
1396   else if (CC == CallingConv::Fast)
1397     return CC_X86_32_FastCC;
1398   else
1399     return CC_X86_32_C;
1400 }
1401
1402 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1403 /// by "Src" to address "Dst" with size and alignment information specified by
1404 /// the specific parameter attribute. The copy will be passed as a byval
1405 /// function parameter.
1406 static SDValue
1407 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1408                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1409                           DebugLoc dl) {
1410   SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1411   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1412                        /*AlwaysInline=*/true, NULL, 0, NULL, 0);
1413 }
1414
1415 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
1416 /// a tailcall target by changing its ABI.
1417 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1418   return GuaranteedTailCallOpt && CC == CallingConv::Fast;
1419 }
1420
1421 SDValue
1422 X86TargetLowering::LowerMemArgument(SDValue Chain,
1423                                     CallingConv::ID CallConv,
1424                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1425                                     DebugLoc dl, SelectionDAG &DAG,
1426                                     const CCValAssign &VA,
1427                                     MachineFrameInfo *MFI,
1428                                     unsigned i) {
1429   // Create the nodes corresponding to a load from this parameter slot.
1430   ISD::ArgFlagsTy Flags = Ins[i].Flags;
1431   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1432   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1433   EVT ValVT;
1434
1435   // If value is passed by pointer we have address passed instead of the value
1436   // itself.
1437   if (VA.getLocInfo() == CCValAssign::Indirect)
1438     ValVT = VA.getLocVT();
1439   else
1440     ValVT = VA.getValVT();
1441
1442   // FIXME: For now, all byval parameter objects are marked mutable. This can be
1443   // changed with more analysis.
1444   // In case of tail call optimization mark all arguments mutable. Since they
1445   // could be overwritten by lowering of arguments in case of a tail call.
1446   if (Flags.isByVal()) {
1447     int FI = MFI->CreateFixedObject(Flags.getByValSize(),
1448                                     VA.getLocMemOffset(), isImmutable, false);
1449     return DAG.getFrameIndex(FI, getPointerTy());
1450   } else {
1451     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1452                                     VA.getLocMemOffset(), isImmutable, false);
1453     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1454     return DAG.getLoad(ValVT, dl, Chain, FIN,
1455                        PseudoSourceValue::getFixedStack(FI), 0,
1456                        false, false, 0);
1457   }
1458 }
1459
1460 SDValue
1461 X86TargetLowering::LowerFormalArguments(SDValue Chain,
1462                                         CallingConv::ID CallConv,
1463                                         bool isVarArg,
1464                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1465                                         DebugLoc dl,
1466                                         SelectionDAG &DAG,
1467                                         SmallVectorImpl<SDValue> &InVals) {
1468
1469   MachineFunction &MF = DAG.getMachineFunction();
1470   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1471
1472   const Function* Fn = MF.getFunction();
1473   if (Fn->hasExternalLinkage() &&
1474       Subtarget->isTargetCygMing() &&
1475       Fn->getName() == "main")
1476     FuncInfo->setForceFramePointer(true);
1477
1478   MachineFrameInfo *MFI = MF.getFrameInfo();
1479   bool Is64Bit = Subtarget->is64Bit();
1480   bool IsWin64 = Subtarget->isTargetWin64();
1481
1482   assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1483          "Var args not supported with calling convention fastcc");
1484
1485   // Assign locations to all of the incoming arguments.
1486   SmallVector<CCValAssign, 16> ArgLocs;
1487   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1488                  ArgLocs, *DAG.getContext());
1489   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1490
1491   unsigned LastVal = ~0U;
1492   SDValue ArgValue;
1493   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1494     CCValAssign &VA = ArgLocs[i];
1495     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1496     // places.
1497     assert(VA.getValNo() != LastVal &&
1498            "Don't support value assigned to multiple locs yet");
1499     LastVal = VA.getValNo();
1500
1501     if (VA.isRegLoc()) {
1502       EVT RegVT = VA.getLocVT();
1503       TargetRegisterClass *RC = NULL;
1504       if (RegVT == MVT::i32)
1505         RC = X86::GR32RegisterClass;
1506       else if (Is64Bit && RegVT == MVT::i64)
1507         RC = X86::GR64RegisterClass;
1508       else if (RegVT == MVT::f32)
1509         RC = X86::FR32RegisterClass;
1510       else if (RegVT == MVT::f64)
1511         RC = X86::FR64RegisterClass;
1512       else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1513         RC = X86::VR128RegisterClass;
1514       else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1515         RC = X86::VR64RegisterClass;
1516       else
1517         llvm_unreachable("Unknown argument type!");
1518
1519       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1520       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1521
1522       // If this is an 8 or 16-bit value, it is really passed promoted to 32
1523       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1524       // right size.
1525       if (VA.getLocInfo() == CCValAssign::SExt)
1526         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1527                                DAG.getValueType(VA.getValVT()));
1528       else if (VA.getLocInfo() == CCValAssign::ZExt)
1529         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1530                                DAG.getValueType(VA.getValVT()));
1531       else if (VA.getLocInfo() == CCValAssign::BCvt)
1532         ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1533
1534       if (VA.isExtInLoc()) {
1535         // Handle MMX values passed in XMM regs.
1536         if (RegVT.isVector()) {
1537           ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1538                                  ArgValue, DAG.getConstant(0, MVT::i64));
1539           ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1540         } else
1541           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1542       }
1543     } else {
1544       assert(VA.isMemLoc());
1545       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1546     }
1547
1548     // If value is passed via pointer - do a load.
1549     if (VA.getLocInfo() == CCValAssign::Indirect)
1550       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
1551                              false, false, 0);
1552
1553     InVals.push_back(ArgValue);
1554   }
1555
1556   // The x86-64 ABI for returning structs by value requires that we copy
1557   // the sret argument into %rax for the return. Save the argument into
1558   // a virtual register so that we can access it from the return points.
1559   if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1560     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1561     unsigned Reg = FuncInfo->getSRetReturnReg();
1562     if (!Reg) {
1563       Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1564       FuncInfo->setSRetReturnReg(Reg);
1565     }
1566     SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1567     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1568   }
1569
1570   unsigned StackSize = CCInfo.getNextStackOffset();
1571   // Align stack specially for tail calls.
1572   if (FuncIsMadeTailCallSafe(CallConv))
1573     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1574
1575   // If the function takes variable number of arguments, make a frame index for
1576   // the start of the first vararg value... for expansion of llvm.va_start.
1577   if (isVarArg) {
1578     if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1579       VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
1580     }
1581     if (Is64Bit) {
1582       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1583
1584       // FIXME: We should really autogenerate these arrays
1585       static const unsigned GPR64ArgRegsWin64[] = {
1586         X86::RCX, X86::RDX, X86::R8,  X86::R9
1587       };
1588       static const unsigned XMMArgRegsWin64[] = {
1589         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1590       };
1591       static const unsigned GPR64ArgRegs64Bit[] = {
1592         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1593       };
1594       static const unsigned XMMArgRegs64Bit[] = {
1595         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1596         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1597       };
1598       const unsigned *GPR64ArgRegs, *XMMArgRegs;
1599
1600       if (IsWin64) {
1601         TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1602         GPR64ArgRegs = GPR64ArgRegsWin64;
1603         XMMArgRegs = XMMArgRegsWin64;
1604       } else {
1605         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1606         GPR64ArgRegs = GPR64ArgRegs64Bit;
1607         XMMArgRegs = XMMArgRegs64Bit;
1608       }
1609       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1610                                                        TotalNumIntRegs);
1611       unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1612                                                        TotalNumXMMRegs);
1613
1614       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1615       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1616              "SSE register cannot be used when SSE is disabled!");
1617       assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1618              "SSE register cannot be used when SSE is disabled!");
1619       if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1620         // Kernel mode asks for SSE to be disabled, so don't push them
1621         // on the stack.
1622         TotalNumXMMRegs = 0;
1623
1624       // For X86-64, if there are vararg parameters that are passed via
1625       // registers, then we must store them to their spots on the stack so they
1626       // may be loaded by deferencing the result of va_next.
1627       VarArgsGPOffset = NumIntRegs * 8;
1628       VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1629       RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1630                                                  TotalNumXMMRegs * 16, 16,
1631                                                  false);
1632
1633       // Store the integer parameter registers.
1634       SmallVector<SDValue, 8> MemOps;
1635       SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1636       unsigned Offset = VarArgsGPOffset;
1637       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1638         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1639                                   DAG.getIntPtrConstant(Offset));
1640         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1641                                      X86::GR64RegisterClass);
1642         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1643         SDValue Store =
1644           DAG.getStore(Val.getValue(1), dl, Val, FIN,
1645                        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
1646                        Offset, false, false, 0);
1647         MemOps.push_back(Store);
1648         Offset += 8;
1649       }
1650
1651       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1652         // Now store the XMM (fp + vector) parameter registers.
1653         SmallVector<SDValue, 11> SaveXMMOps;
1654         SaveXMMOps.push_back(Chain);
1655
1656         unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1657         SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1658         SaveXMMOps.push_back(ALVal);
1659
1660         SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
1661         SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
1662
1663         for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1664           unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1665                                        X86::VR128RegisterClass);
1666           SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1667           SaveXMMOps.push_back(Val);
1668         }
1669         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1670                                      MVT::Other,
1671                                      &SaveXMMOps[0], SaveXMMOps.size()));
1672       }
1673
1674       if (!MemOps.empty())
1675         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1676                             &MemOps[0], MemOps.size());
1677     }
1678   }
1679
1680   // Some CCs need callee pop.
1681   if (IsCalleePop(isVarArg, CallConv)) {
1682     BytesToPopOnReturn  = StackSize; // Callee pops everything.
1683   } else {
1684     BytesToPopOnReturn  = 0; // Callee pops nothing.
1685     // If this is an sret function, the return should pop the hidden pointer.
1686     if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
1687       BytesToPopOnReturn = 4;
1688   }
1689
1690   if (!Is64Bit) {
1691     RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
1692     if (CallConv == CallingConv::X86_FastCall)
1693       VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
1694   }
1695
1696   FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1697
1698   return Chain;
1699 }
1700
1701 SDValue
1702 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1703                                     SDValue StackPtr, SDValue Arg,
1704                                     DebugLoc dl, SelectionDAG &DAG,
1705                                     const CCValAssign &VA,
1706                                     ISD::ArgFlagsTy Flags) {
1707   const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1708   unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1709   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1710   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1711   if (Flags.isByVal()) {
1712     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1713   }
1714   return DAG.getStore(Chain, dl, Arg, PtrOff,
1715                       PseudoSourceValue::getStack(), LocMemOffset,
1716                       false, false, 0);
1717 }
1718
1719 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1720 /// optimization is performed and it is required.
1721 SDValue
1722 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1723                                            SDValue &OutRetAddr, SDValue Chain,
1724                                            bool IsTailCall, bool Is64Bit,
1725                                            int FPDiff, DebugLoc dl) {
1726   // Adjust the Return address stack slot.
1727   EVT VT = getPointerTy();
1728   OutRetAddr = getReturnAddressFrameIndex(DAG);
1729
1730   // Load the "old" Return address.
1731   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
1732   return SDValue(OutRetAddr.getNode(), 1);
1733 }
1734
1735 /// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call
1736 /// optimization is performed and it is required (FPDiff!=0).
1737 static SDValue
1738 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1739                          SDValue Chain, SDValue RetAddrFrIdx,
1740                          bool Is64Bit, int FPDiff, DebugLoc dl) {
1741   // Store the return address to the appropriate stack slot.
1742   if (!FPDiff) return Chain;
1743   // Calculate the new stack slot for the return address.
1744   int SlotSize = Is64Bit ? 8 : 4;
1745   int NewReturnAddrFI =
1746     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true,false);
1747   EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1748   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1749   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1750                        PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1751                        false, false, 0);
1752   return Chain;
1753 }
1754
1755 SDValue
1756 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1757                              CallingConv::ID CallConv, bool isVarArg,
1758                              bool &isTailCall,
1759                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1760                              const SmallVectorImpl<ISD::InputArg> &Ins,
1761                              DebugLoc dl, SelectionDAG &DAG,
1762                              SmallVectorImpl<SDValue> &InVals) {
1763   MachineFunction &MF = DAG.getMachineFunction();
1764   bool Is64Bit        = Subtarget->is64Bit();
1765   bool IsStructRet    = CallIsStructReturn(Outs);
1766   bool IsSibcall      = false;
1767
1768   if (isTailCall) {
1769     // Check if it's really possible to do a tail call.
1770     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
1771                                                    Outs, Ins, DAG);
1772
1773     // Sibcalls are automatically detected tailcalls which do not require
1774     // ABI changes.
1775     if (!GuaranteedTailCallOpt && isTailCall)
1776       IsSibcall = true;
1777
1778     if (isTailCall)
1779       ++NumTailCalls;
1780   }
1781
1782   assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1783          "Var args not supported with calling convention fastcc");
1784
1785   // Analyze operands of the call, assigning locations to each operand.
1786   SmallVector<CCValAssign, 16> ArgLocs;
1787   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1788                  ArgLocs, *DAG.getContext());
1789   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1790
1791   // Get a count of how many bytes are to be pushed on the stack.
1792   unsigned NumBytes = CCInfo.getNextStackOffset();
1793   if (IsSibcall)
1794     // This is a sibcall. The memory operands are available in caller's
1795     // own caller's stack.
1796     NumBytes = 0;
1797   else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast)
1798     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1799
1800   int FPDiff = 0;
1801   if (isTailCall && !IsSibcall) {
1802     // Lower arguments at fp - stackoffset + fpdiff.
1803     unsigned NumBytesCallerPushed =
1804       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1805     FPDiff = NumBytesCallerPushed - NumBytes;
1806
1807     // Set the delta of movement of the returnaddr stackslot.
1808     // But only set if delta is greater than previous delta.
1809     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1810       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1811   }
1812
1813   if (!IsSibcall)
1814     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1815
1816   SDValue RetAddrFrIdx;
1817   // Load return adress for tail calls.
1818   if (isTailCall && FPDiff)
1819     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1820                                     Is64Bit, FPDiff, dl);
1821
1822   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1823   SmallVector<SDValue, 8> MemOpChains;
1824   SDValue StackPtr;
1825
1826   // Walk the register/memloc assignments, inserting copies/loads.  In the case
1827   // of tail call optimization arguments are handle later.
1828   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1829     CCValAssign &VA = ArgLocs[i];
1830     EVT RegVT = VA.getLocVT();
1831     SDValue Arg = Outs[i].Val;
1832     ISD::ArgFlagsTy Flags = Outs[i].Flags;
1833     bool isByVal = Flags.isByVal();
1834
1835     // Promote the value if needed.
1836     switch (VA.getLocInfo()) {
1837     default: llvm_unreachable("Unknown loc info!");
1838     case CCValAssign::Full: break;
1839     case CCValAssign::SExt:
1840       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1841       break;
1842     case CCValAssign::ZExt:
1843       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1844       break;
1845     case CCValAssign::AExt:
1846       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1847         // Special case: passing MMX values in XMM registers.
1848         Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1849         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1850         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1851       } else
1852         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1853       break;
1854     case CCValAssign::BCvt:
1855       Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1856       break;
1857     case CCValAssign::Indirect: {
1858       // Store the argument.
1859       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1860       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1861       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1862                            PseudoSourceValue::getFixedStack(FI), 0,
1863                            false, false, 0);
1864       Arg = SpillSlot;
1865       break;
1866     }
1867     }
1868
1869     if (VA.isRegLoc()) {
1870       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1871     } else if (!IsSibcall && (!isTailCall || isByVal)) {
1872       assert(VA.isMemLoc());
1873       if (StackPtr.getNode() == 0)
1874         StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1875       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1876                                              dl, DAG, VA, Flags));
1877     }
1878   }
1879
1880   if (!MemOpChains.empty())
1881     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1882                         &MemOpChains[0], MemOpChains.size());
1883
1884   // Build a sequence of copy-to-reg nodes chained together with token chain
1885   // and flag operands which copy the outgoing args into registers.
1886   SDValue InFlag;
1887   // Tail call byval lowering might overwrite argument registers so in case of
1888   // tail call optimization the copies to registers are lowered later.
1889   if (!isTailCall)
1890     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1891       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1892                                RegsToPass[i].second, InFlag);
1893       InFlag = Chain.getValue(1);
1894     }
1895
1896   if (Subtarget->isPICStyleGOT()) {
1897     // ELF / PIC requires GOT in the EBX register before function calls via PLT
1898     // GOT pointer.
1899     if (!isTailCall) {
1900       Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1901                                DAG.getNode(X86ISD::GlobalBaseReg,
1902                                            DebugLoc::getUnknownLoc(),
1903                                            getPointerTy()),
1904                                InFlag);
1905       InFlag = Chain.getValue(1);
1906     } else {
1907       // If we are tail calling and generating PIC/GOT style code load the
1908       // address of the callee into ECX. The value in ecx is used as target of
1909       // the tail jump. This is done to circumvent the ebx/callee-saved problem
1910       // for tail calls on PIC/GOT architectures. Normally we would just put the
1911       // address of GOT into ebx and then call target@PLT. But for tail calls
1912       // ebx would be restored (since ebx is callee saved) before jumping to the
1913       // target@PLT.
1914
1915       // Note: The actual moving to ECX is done further down.
1916       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1917       if (G && !G->getGlobal()->hasHiddenVisibility() &&
1918           !G->getGlobal()->hasProtectedVisibility())
1919         Callee = LowerGlobalAddress(Callee, DAG);
1920       else if (isa<ExternalSymbolSDNode>(Callee))
1921         Callee = LowerExternalSymbol(Callee, DAG);
1922     }
1923   }
1924
1925   if (Is64Bit && isVarArg) {
1926     // From AMD64 ABI document:
1927     // For calls that may call functions that use varargs or stdargs
1928     // (prototype-less calls or calls to functions containing ellipsis (...) in
1929     // the declaration) %al is used as hidden argument to specify the number
1930     // of SSE registers used. The contents of %al do not need to match exactly
1931     // the number of registers, but must be an ubound on the number of SSE
1932     // registers used and is in the range 0 - 8 inclusive.
1933
1934     // FIXME: Verify this on Win64
1935     // Count the number of XMM registers allocated.
1936     static const unsigned XMMArgRegs[] = {
1937       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1938       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1939     };
1940     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1941     assert((Subtarget->hasSSE1() || !NumXMMRegs)
1942            && "SSE registers cannot be used when SSE is disabled");
1943
1944     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1945                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1946     InFlag = Chain.getValue(1);
1947   }
1948
1949
1950   // For tail calls lower the arguments to the 'real' stack slot.
1951   if (isTailCall) {
1952     // Force all the incoming stack arguments to be loaded from the stack
1953     // before any new outgoing arguments are stored to the stack, because the
1954     // outgoing stack slots may alias the incoming argument stack slots, and
1955     // the alias isn't otherwise explicit. This is slightly more conservative
1956     // than necessary, because it means that each store effectively depends
1957     // on every argument instead of just those arguments it would clobber.
1958     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
1959
1960     SmallVector<SDValue, 8> MemOpChains2;
1961     SDValue FIN;
1962     int FI = 0;
1963     // Do not flag preceeding copytoreg stuff together with the following stuff.
1964     InFlag = SDValue();
1965     if (GuaranteedTailCallOpt) {
1966       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1967         CCValAssign &VA = ArgLocs[i];
1968         if (VA.isRegLoc())
1969           continue;
1970         assert(VA.isMemLoc());
1971         SDValue Arg = Outs[i].Val;
1972         ISD::ArgFlagsTy Flags = Outs[i].Flags;
1973         // Create frame index.
1974         int32_t Offset = VA.getLocMemOffset()+FPDiff;
1975         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1976         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
1977         FIN = DAG.getFrameIndex(FI, getPointerTy());
1978
1979         if (Flags.isByVal()) {
1980           // Copy relative to framepointer.
1981           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1982           if (StackPtr.getNode() == 0)
1983             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1984                                           getPointerTy());
1985           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1986
1987           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
1988                                                            ArgChain,
1989                                                            Flags, DAG, dl));
1990         } else {
1991           // Store relative to framepointer.
1992           MemOpChains2.push_back(
1993             DAG.getStore(ArgChain, dl, Arg, FIN,
1994                          PseudoSourceValue::getFixedStack(FI), 0,
1995                          false, false, 0));
1996         }
1997       }
1998     }
1999
2000     if (!MemOpChains2.empty())
2001       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2002                           &MemOpChains2[0], MemOpChains2.size());
2003
2004     // Copy arguments to their registers.
2005     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2006       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2007                                RegsToPass[i].second, InFlag);
2008       InFlag = Chain.getValue(1);
2009     }
2010     InFlag =SDValue();
2011
2012     // Store the return address to the appropriate stack slot.
2013     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2014                                      FPDiff, dl);
2015   }
2016
2017   bool WasGlobalOrExternal = false;
2018   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2019     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2020     // In the 64-bit large code model, we have to make all calls
2021     // through a register, since the call instruction's 32-bit
2022     // pc-relative offset may not be large enough to hold the whole
2023     // address.
2024   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2025     WasGlobalOrExternal = true;
2026     // If the callee is a GlobalAddress node (quite common, every direct call
2027     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2028     // it.
2029
2030     // We should use extra load for direct calls to dllimported functions in
2031     // non-JIT mode.
2032     GlobalValue *GV = G->getGlobal();
2033     if (!GV->hasDLLImportLinkage()) {
2034       unsigned char OpFlags = 0;
2035
2036       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2037       // external symbols most go through the PLT in PIC mode.  If the symbol
2038       // has hidden or protected visibility, or if it is static or local, then
2039       // we don't need to use the PLT - we can directly call it.
2040       if (Subtarget->isTargetELF() &&
2041           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2042           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2043         OpFlags = X86II::MO_PLT;
2044       } else if (Subtarget->isPICStyleStubAny() &&
2045                (GV->isDeclaration() || GV->isWeakForLinker()) &&
2046                Subtarget->getDarwinVers() < 9) {
2047         // PC-relative references to external symbols should go through $stub,
2048         // unless we're building with the leopard linker or later, which
2049         // automatically synthesizes these stubs.
2050         OpFlags = X86II::MO_DARWIN_STUB;
2051       }
2052
2053       Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
2054                                           G->getOffset(), OpFlags);
2055     }
2056   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2057     WasGlobalOrExternal = true;
2058     unsigned char OpFlags = 0;
2059
2060     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2061     // symbols should go through the PLT.
2062     if (Subtarget->isTargetELF() &&
2063         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2064       OpFlags = X86II::MO_PLT;
2065     } else if (Subtarget->isPICStyleStubAny() &&
2066              Subtarget->getDarwinVers() < 9) {
2067       // PC-relative references to external symbols should go through $stub,
2068       // unless we're building with the leopard linker or later, which
2069       // automatically synthesizes these stubs.
2070       OpFlags = X86II::MO_DARWIN_STUB;
2071     }
2072
2073     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2074                                          OpFlags);
2075   }
2076
2077   if (isTailCall && !WasGlobalOrExternal) {
2078     // Force the address into a (call preserved) caller-saved register since
2079     // tailcall must happen after callee-saved registers are poped.
2080     // FIXME: Give it a special register class that contains caller-saved
2081     // register instead?
2082     unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
2083     Chain = DAG.getCopyToReg(Chain,  dl,
2084                              DAG.getRegister(TCReg, getPointerTy()),
2085                              Callee,InFlag);
2086     Callee = DAG.getRegister(TCReg, getPointerTy());
2087   }
2088
2089   // Returns a chain & a flag for retval copy to use.
2090   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2091   SmallVector<SDValue, 8> Ops;
2092
2093   if (!IsSibcall && isTailCall) {
2094     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2095                            DAG.getIntPtrConstant(0, true), InFlag);
2096     InFlag = Chain.getValue(1);
2097   }
2098
2099   Ops.push_back(Chain);
2100   Ops.push_back(Callee);
2101
2102   if (isTailCall)
2103     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2104
2105   // Add argument registers to the end of the list so that they are known live
2106   // into the call.
2107   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2108     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2109                                   RegsToPass[i].second.getValueType()));
2110
2111   // Add an implicit use GOT pointer in EBX.
2112   if (!isTailCall && Subtarget->isPICStyleGOT())
2113     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2114
2115   // Add an implicit use of AL for x86 vararg functions.
2116   if (Is64Bit && isVarArg)
2117     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2118
2119   if (InFlag.getNode())
2120     Ops.push_back(InFlag);
2121
2122   if (isTailCall) {
2123     // If this is the first return lowered for this function, add the regs
2124     // to the liveout set for the function.
2125     if (MF.getRegInfo().liveout_empty()) {
2126       SmallVector<CCValAssign, 16> RVLocs;
2127       CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
2128                      *DAG.getContext());
2129       CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2130       for (unsigned i = 0; i != RVLocs.size(); ++i)
2131         if (RVLocs[i].isRegLoc())
2132           MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
2133     }
2134
2135     assert(((Callee.getOpcode() == ISD::Register &&
2136                (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
2137                 cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
2138               Callee.getOpcode() == ISD::TargetExternalSymbol ||
2139               Callee.getOpcode() == ISD::TargetGlobalAddress) &&
2140            "Expecting a global address, external symbol, or scratch register");
2141
2142     return DAG.getNode(X86ISD::TC_RETURN, dl,
2143                        NodeTys, &Ops[0], Ops.size());
2144   }
2145
2146   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2147   InFlag = Chain.getValue(1);
2148
2149   // Create the CALLSEQ_END node.
2150   unsigned NumBytesForCalleeToPush;
2151   if (IsCalleePop(isVarArg, CallConv))
2152     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2153   else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
2154     // If this is a call to a struct-return function, the callee
2155     // pops the hidden struct pointer, so we have to push it back.
2156     // This is common for Darwin/X86, Linux & Mingw32 targets.
2157     NumBytesForCalleeToPush = 4;
2158   else
2159     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2160
2161   // Returns a flag for retval copy to use.
2162   if (!IsSibcall) {
2163     Chain = DAG.getCALLSEQ_END(Chain,
2164                                DAG.getIntPtrConstant(NumBytes, true),
2165                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2166                                                      true),
2167                                InFlag);
2168     InFlag = Chain.getValue(1);
2169   }
2170
2171   // Handle result values, copying them out of physregs into vregs that we
2172   // return.
2173   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2174                          Ins, dl, DAG, InVals);
2175 }
2176
2177
2178 //===----------------------------------------------------------------------===//
2179 //                Fast Calling Convention (tail call) implementation
2180 //===----------------------------------------------------------------------===//
2181
2182 //  Like stdcall, the callee cleans up the arguments, except that ECX is
2183 //  reserved for storing the tail called function address. Only 2 registers are
2184 //  free for argument passing (inreg). Tail call optimization is performed
2185 //  provided:
2186 //                * tailcallopt is enabled
2187 //                * caller/callee are fastcc
2188 //  On X86_64 architecture with GOT-style position independent code only local
2189 //  (within module) calls are supported at the moment.
2190 //  To keep the stack aligned according to platform abi the function
2191 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
2192 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
2193 //  If a tail called function callee has more arguments than the caller the
2194 //  caller needs to make sure that there is room to move the RETADDR to. This is
2195 //  achieved by reserving an area the size of the argument delta right after the
2196 //  original RETADDR, but before the saved framepointer or the spilled registers
2197 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2198 //  stack layout:
2199 //    arg1
2200 //    arg2
2201 //    RETADDR
2202 //    [ new RETADDR
2203 //      move area ]
2204 //    (possible EBP)
2205 //    ESI
2206 //    EDI
2207 //    local1 ..
2208
2209 /// GetAlignedArgumentStackSize - Round StackSize up to the smallest value
2210 /// of the form StackAlignment*n + (StackAlignment - SlotSize), e.g. 16n + 12
     /// for a 16 byte alignment requirement and a 4 byte slot. StackAlignment is
     /// read from the target's TargetFrameInfo and SlotSize is
     /// TD->getPointerSize(). A StackSize already of that form is returned as is.
2211 unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2212                                                         SelectionDAG& DAG) {
2213   MachineFunction &MF = DAG.getMachineFunction();
2214   const TargetMachine &TM = MF.getTarget();
2215   const TargetFrameInfo &TFI = *TM.getFrameInfo();
2216   unsigned StackAlignment = TFI.getStackAlignment();
       // AlignMask selects the bits of Offset below the alignment; this assumes
       // StackAlignment is a power of two.
2217   uint64_t AlignMask = StackAlignment - 1;
2218   int64_t Offset = StackSize;
2219   uint64_t SlotSize = TD->getPointerSize();
2220   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2221     // Low bits are at or below the wanted remainder: just add the difference.
2222     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2223   } else {
2224     // Low bits are past the wanted remainder: mask them out, step to the next
         // alignment boundary and add the wanted remainder.
2225     Offset = ((~AlignMask) & Offset) + StackAlignment +
2226       (StackAlignment-SlotSize);
2227   }
       // The 64-bit Offset is converted to the unsigned return type here.
2228   return Offset;
2229 }
2230
2231 /// MatchingStackOffset - Return true if the given stack call argument is
2232 /// already available in the same position (relatively) of the caller's
2233 /// incoming argument stack.
     ///
     /// Arg is traced back to a frame index in one of two ways: through the
     /// instruction defining the virtual register of a CopyFromReg, or through a
     /// load whose base pointer is a frame index node. Any other shape yields
     /// false. The frame index must then name a fixed object whose offset equals
     /// Offset.
2234 static
2235 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2236                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2237                          const X86InstrInfo *TII) {
       // FI is assigned on every path that reaches the final checks below.
2238   int FI;
2239   if (Arg.getOpcode() == ISD::CopyFromReg) {
         // The argument is a register copy: inspect the definition of the source
         // register. Only virtual registers with a known def are handled.
2240     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2241     if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2242       return false;
2243     MachineInstr *Def = MRI->getVRegDef(VR);
2244     if (!Def)
2245       return false;
2246     if (!Flags.isByVal()) {
           // Plain value: the def has to be a load from a stack slot; FI is
           // handed to isLoadFromStackSlot, which is expected to fill it in.
2247       if (!TII->isLoadFromStackSlot(Def, FI))
2248         return false;
2249     } else {
           // byval: the def has to be an LEA whose operand 1 is a frame index,
           // and that object must have exactly the byval size.
2250       unsigned Opcode = Def->getOpcode();
2251       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2252           Def->getOperand(1).isFI()) {
2253         FI = Def->getOperand(1).getIndex();
2254         if (MFI->getObjectSize(FI) != Flags.getByValSize())
2255           return false;
2256       } else
2257         return false;
2258     }
2259   } else {
         // Otherwise the argument must be a load directly off a frame index.
2260     LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg);
2261     if (!Ld)
2262       return false;
2263     SDValue Ptr = Ld->getBasePtr();
2264     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2265     if (!FINode)
2266       return false;
2267     FI = FINode->getIndex();
2268   }
2269
       // Only a fixed object located at the requested offset counts as a match.
2270   if (!MFI->isFixedObjectIndex(FI))
2271     return false;
2272   return Offset == MFI->getObjectOffset(FI);
2273 }
2274
2275 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2276 /// for tail call optimization. Targets which want to do tail call
2277 /// optimization should implement this function.
2278 bool
2279 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2280                                                      CallingConv::ID CalleeCC,
2281                                                      bool isVarArg,
2282                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2283                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2284                                                      SelectionDAG& DAG) const {
2285   if (CalleeCC != CallingConv::Fast &&
2286       CalleeCC != CallingConv::C)
2287     return false;
2288
2289   // If -tailcallopt is specified, make fastcc functions tail-callable.
2290   const Function *CallerF = DAG.getMachineFunction().getFunction();
2291   if (GuaranteedTailCallOpt) {
2292     if (CalleeCC == CallingConv::Fast &&
2293         CallerF->getCallingConv() == CalleeCC)
2294       return true;
2295     return false;
2296   }
2297
2298   // Look for obvious safe cases to perform tail call optimization that does not
2299   // requite ABI changes. This is what gcc calls sibcall.
2300
2301   // Do not tail call optimize vararg calls for now.
2302   if (isVarArg)
2303     return false;
2304
2305   // If the callee takes no arguments then go on to check the results of the
2306   // call.
2307   if (!Outs.empty()) {
2308     // Check if stack adjustment is needed. For now, do not do this if any
2309     // argument is passed on the stack.
2310     SmallVector<CCValAssign, 16> ArgLocs;
2311     CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2312                    ArgLocs, *DAG.getContext());
2313     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2314     if (CCInfo.getNextStackOffset()) {
2315       MachineFunction &MF = DAG.getMachineFunction();
2316       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2317         return false;
2318       if (Subtarget->isTargetWin64())
2319         // Win64 ABI has additional complications.
2320         return false;
2321
2322       // Check if the arguments are already laid out in the right way as
2323       // the caller's fixed stack objects.
2324       MachineFrameInfo *MFI = MF.getFrameInfo();
2325       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2326       const X86InstrInfo *TII =
2327         ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2328       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2329         CCValAssign &VA = ArgLocs[i];
2330         EVT RegVT = VA.getLocVT();
2331         SDValue Arg = Outs[i].Val;
2332         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2333         if (VA.getLocInfo() == CCValAssign::Indirect)
2334           return false;
2335         if (!VA.isRegLoc()) {
2336           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2337                                    MFI, MRI, TII))
2338             return false;
2339         }
2340       }
2341     }
2342   }
2343
2344   return true;
2345 }
2346
2347 FastISel *
2348 X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo,
2349                             DwarfWriter *dw,
2350                             DenseMap<const Value *, unsigned> &vm,
2351                             DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
2352                             DenseMap<const AllocaInst *, int> &am
2353 #ifndef NDEBUG
2354                           , SmallSet<Instruction*, 8> &cil
2355 #endif
2356                                   ) {
       // Pure forwarder: every argument is passed on unchanged, and in the same
       // order, to X86::createFastISel. The trailing cil parameter is only part
       // of the signature (and of the call) when NDEBUG is not defined.
2357   return X86::createFastISel(mf, mmo, dw, vm, bm, am
2358 #ifndef NDEBUG
2359                              , cil
2360 #endif
2361                              );
2362 }
2363
2364
2365 //===----------------------------------------------------------------------===//
2366 //                           Other Lowering Hooks
2367 //===----------------------------------------------------------------------===//
2368
2369
2370 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
       // Returns a frame index node, of pointer type, for the index stored in
       // the function's X86MachineFunctionInfo (getRAIndex). A stored index of 0
       // is treated as "not created yet": in that case a fixed object of pointer
       // size at offset -SlotSize is created and recorded with setRAIndex.
2371   MachineFunction &MF = DAG.getMachineFunction();
2372   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2373   int ReturnAddrIndex = FuncInfo->getRAIndex();
2374
2375   if (ReturnAddrIndex == 0) {
2376     // Set up a frame object for the return address.
2377     uint64_t SlotSize = TD->getPointerSize();
         // NOTE(review): -SlotSize negates an unsigned value; this yields the
         // intended negative offset only if the parameter it is passed to is a
         // signed 64-bit type - confirm against CreateFixedObject's declaration.
2378     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2379                                                            true, false);
2380     FuncInfo->setRAIndex(ReturnAddrIndex);
2381   }
2382
2383   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2384 }
2385
2386
2387 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2388                                        bool hasSymbolicDisplacement) {
2389   // Offset should fit into 32 bit immediate field.
2390   if (!isInt32(Offset))
2391     return false;
2392
2393   // If we don't have a symbolic displacement - we don't have any extra
2394   // restrictions.
2395   if (!hasSymbolicDisplacement)
2396     return true;
2397
2398   // FIXME: Some tweaks might be needed for medium code model.
2399   if (M != CodeModel::Small && M != CodeModel::Kernel)
2400     return false;
2401
2402   // For small code model we assume that latest object is 16MB before end of 31
2403   // bits boundary. We may also accept pretty large negative constants knowing
2404   // that all objects are in the positive half of address space.
2405   if (M == CodeModel::Small && Offset < 16*1024*1024)
2406     return true;
2407
2408   // For kernel code model we know that all object resist in the negative half
2409   // of 32bits address space. We may not accept negative offsets, since they may
2410   // be just off and we may accept pretty large positive ones.
2411   if (M == CodeModel::Kernel && Offset > 0)
2412     return true;
2413
2414   return false;
2415 }
2416
2417 /// TranslateX86CC - Translate an ISD::CondCode into the X86 specific condition
2418 /// code. The X86 code is the return value; LHS and RHS are updated in place to
2419 /// the operands of the comparison to make.
     ///
     /// isFP selects the floating point mapping. In that mapping the operands may
     /// be swapped, and SETOEQ / SETUNE yield X86::COND_INVALID. In the integer
     /// mapping RHS may be replaced by a zero constant.
2420 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2421                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2422   if (!isFP) {
         // Integer compares against a few special constants are rewritten as
         // compares against zero.
2423     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2424       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2425         // X > -1   -> compare X with 0, jump on !sign.
2426         RHS = DAG.getConstant(0, RHS.getValueType());
2427         return X86::COND_NS;
2428       } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2429         // X < 0   -> RHS is already 0, jump on sign.
2430         return X86::COND_S;
2431       } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2432         // X < 1   -> X <= 0
2433         RHS = DAG.getConstant(0, RHS.getValueType());
2434         return X86::COND_LE;
2435       }
2436     }
2437
         // Direct mapping of the signed and unsigned integer conditions.
2438     switch (SetCCOpcode) {
2439     default: llvm_unreachable("Invalid integer condition!");
2440     case ISD::SETEQ:  return X86::COND_E;
2441     case ISD::SETGT:  return X86::COND_G;
2442     case ISD::SETGE:  return X86::COND_GE;
2443     case ISD::SETLT:  return X86::COND_L;
2444     case ISD::SETLE:  return X86::COND_LE;
2445     case ISD::SETNE:  return X86::COND_NE;
2446     case ISD::SETULT: return X86::COND_B;
2447     case ISD::SETUGT: return X86::COND_A;
2448     case ISD::SETULE: return X86::COND_BE;
2449     case ISD::SETUGE: return X86::COND_AE;
2450     }
2451   }
2452
2453   // First determine if it is required or is profitable to flip the operands.
2454
2455   // If LHS is a foldable load, but RHS is not, flip the condition.
       // Here "foldable" means a non-extending load with a single use. Both the
       // condition code and the operands are swapped, so the meaning is kept.
2456   if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2457       !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2458     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2459     std::swap(LHS, RHS);
2460   }
2461
       // For these four conditions only the operands are swapped; SetCCOpcode is
       // left alone and the final switch maps it to the code of the swapped
       // comparison (the cases marked "flipped").
2462   switch (SetCCOpcode) {
2463   default: break;
2464   case ISD::SETOLT:
2465   case ISD::SETOLE:
2466   case ISD::SETUGT:
2467   case ISD::SETUGE:
2468     std::swap(LHS, RHS);
2469     break;
2470   }
2471
2472   // On a floating point condition, the flags are set as follows:
2473   // ZF  PF  CF   op
2474   //  0 | 0 | 0 | X > Y
2475   //  0 | 0 | 1 | X < Y
2476   //  1 | 0 | 0 | X == Y
2477   //  1 | 1 | 1 | unordered
2478   switch (SetCCOpcode) {
2479   default: llvm_unreachable("Condcode should be pre-legalized away");
2480   case ISD::SETUEQ:
2481   case ISD::SETEQ:   return X86::COND_E;
2482   case ISD::SETOLT:              // flipped
2483   case ISD::SETOGT:
2484   case ISD::SETGT:   return X86::COND_A;
2485   case ISD::SETOLE:              // flipped
2486   case ISD::SETOGE:
2487   case ISD::SETGE:   return X86::COND_AE;
2488   case ISD::SETUGT:              // flipped
2489   case ISD::SETULT:
2490   case ISD::SETLT:   return X86::COND_B;
2491   case ISD::SETUGE:              // flipped
2492   case ISD::SETULE:
2493   case ISD::SETLE:   return X86::COND_BE;
2494   case ISD::SETONE:
2495   case ISD::SETNE:   return X86::COND_NE;
2496   case ISD::SETUO:   return X86::COND_P;
2497   case ISD::SETO:    return X86::COND_NP;
       // No single X86 condition is returned for these two; the caller gets
       // COND_INVALID and has to deal with them itself.
2498   case ISD::SETOEQ:
2499   case ISD::SETUNE:  return X86::COND_INVALID;
2500   }
2501 }
2502
2503 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
2504 /// code. Current x86 isa includes the following FP cmov instructions:
2505 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
     /// Returns true exactly for the eight condition codes listed in the switch
     /// and false for every other value of X86CC.
2506 static bool hasFPCMov(unsigned X86CC) {
2507   switch (X86CC) {
2508   default:
2509     return false;
2510   case X86::COND_B:
2511   case X86::COND_BE:
2512   case X86::COND_E:
2513   case X86::COND_P:
2514   case X86::COND_A:
2515   case X86::COND_AE:
2516   case X86::COND_NE:
2517   case X86::COND_NP:
2518     return true;
2519   }
2520 }
2521
2522 /// isFPImmLegal - Returns true if the target can instruction select the
2523 /// specified FP immediate natively. If false, the legalizer will
2524 /// materialize the FP immediate as a load from a constant pool.
     /// The check is a linear scan of LegalFPImmediates for an entry that is
     /// bitwise equal to Imm; the VT argument is not consulted.
2525 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2526   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2527     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2528       return true;
2529   }
2530   return false;
2531 }
2532
2533 /// isUndefOrInRange - Return true if Val is undef (i.e. negative) or if its
2534 /// value falls within the half-open range [Low, Hi).
2535 static bool isUndefOrInRange(int Val, int Low, int Hi) {
2536   return (Val < 0) || (Val >= Low && Val < Hi);
2537 }
2538
2539 /// isUndefOrEqual - Return true if Val is either less than zero (undef) or
2540 /// equal to CmpVal; return false otherwise.
2541 static bool isUndefOrEqual(int Val, int CmpVal) {
2542   if (Val < 0 || Val == CmpVal)
2543     return true;
2544   return false;
2545 }
2546
2547 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2548 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2549 /// the second operand.
     /// Only the 4-element types v4f32, v4i32, v4i16 and the 2-element types
     /// v2f64, v2i64 are accepted; any other VT yields false. Negative (undef)
     /// mask entries pass the "< NumElems" tests and are therefore allowed.
2550 static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2551   if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2552     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2553   if (VT == MVT::v2f64 || VT == MVT::v2i64)
2554     return (Mask[0] < 2 && Mask[1] < 2);
2555   return false;
2556 }
2557
2558 bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
       // Node-based entry point: fetch the node's mask into a local vector and
       // apply the file-local mask predicate of the same name (hence the
       // leading ::) together with the node's result type.
2559   SmallVector<int, 8> M;
2560   N->getMask(M);
2561   return ::isPSHUFDMask(M, N->getValueType(0));
2562 }
2563
2564 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2565 /// is suitable for input to PSHUFHW.
     /// Only v8i16 is accepted. Elements 0-3 must each be undef (negative) or
     /// equal to their own index; elements 4-7 must each be undef or lie in 4..7.
2566 static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2567   if (VT != MVT::v8i16)
2568     return false;
2569
2570   // Lower quadword copied in order or undef.
2571   for (int i = 0; i != 4; ++i)
2572     if (Mask[i] >= 0 && Mask[i] != i)
2573       return false;
2574
2575   // Upper quadword shuffled.
       // Every defined entry has to pick one of the upper four elements.
2576   for (int i = 4; i != 8; ++i)
2577     if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2578       return false;
2579
2580   return true;
2581 }
2582
2583 bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
       // Node-based entry point: fetch the node's mask into a local vector and
       // apply the file-local mask predicate of the same name (hence the
       // leading ::) together with the node's result type.
2584   SmallVector<int, 8> M;
2585   N->getMask(M);
2586   return ::isPSHUFHWMask(M, N->getValueType(0));
2587 }
2588
2589 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2590 /// is suitable for input to PSHUFLW.
     /// Only v8i16 is accepted. Elements 4-7 must each be undef (negative) or
     /// equal to their own index; elements 0-3 must each be less than 4, which
     /// also admits undef entries.
2591 static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2592   if (VT != MVT::v8i16)
2593     return false;
2594
2595   // Upper quadword copied in order or undef.
2596   for (int i = 4; i != 8; ++i)
2597     if (Mask[i] >= 0 && Mask[i] != i)
2598       return false;
2599
2600   // Lower quadword shuffled.
       // Every entry has to stay within the lower four elements (or be undef).
2601   for (int i = 0; i != 4; ++i)
2602     if (Mask[i] >= 4)
2603       return false;
2604
2605   return true;
2606 }
2607
2608 bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
       // Node-based entry point: fetch the node's mask into a local vector and
       // apply the file-local mask predicate of the same name (hence the
       // leading ::) together with the node's result type.
2609   SmallVector<int, 8> M;
2610   N->getMask(M);
2611   return ::isPSHUFLWMask(M, N->getValueType(0));
2612 }
2613
2614 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2615 /// is suitable for input to PALIGNR.
     /// Requires hasSSSE3 and at least four elements. The defined mask entries
     /// must form one consecutive run: Mask[i] == s + i, where s is fixed by the
     /// first defined entry. If every defined entry refers to the first operand
     /// (value below the element count) the run may wrap around modulo the
     /// element count.
2616 static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2617                           bool hasSSSE3) {
2618   int i, e = VT.getVectorNumElements();
2619
2620   // Do not handle v2i64 / v2f64 shuffles with palignr.
2621   if (e < 4 || !hasSSSE3)
2622     return false;
2623
       // Skip leading undef entries; i ends up at the first defined one.
2624   for (i = 0; i != e; ++i)
2625     if (Mask[i] >= 0)
2626       break;
2627
2628   // All undef, not a palignr.
2629   if (i == e)
2630     return false;
2631
2632   // Determine if it's ok to perform a palignr with only the LHS, since we
2633   // don't have access to the actual shuffle elements to see if RHS is undef.
       // Unary stays true while every defined entry seen so far refers to the
       // first operand; NeedsUnary becomes true once an entry is smaller than
       // the shift s, i.e. the run has wrapped.
2634   bool Unary = Mask[i] < (int)e;
2635   bool NeedsUnary = false;
2636
       // s is the offset between a mask value and its position in the run.
2637   int s = Mask[i] - i;
2638
2639   // Check the rest of the elements to see if they are consecutive.
2640   for (++i; i != e; ++i) {
2641     int m = Mask[i];
2642     if (m < 0)
2643       continue;
2644
2645     Unary = Unary && (m < (int)e);
2646     NeedsUnary = NeedsUnary || (m < s);
2647
         // A wrapped run is only acceptable when it is unary.
2648     if (NeedsUnary && !Unary)
2649       return false;
         // Unary: compare modulo the element count (the & (e-1) form assumes e
         // is a power of two). Otherwise the entry must equal s+i exactly.
2650     if (Unary && m != ((s+i) & (e-1)))
2651       return false;
2652     if (!Unary && m != (s+i))
2653       return false;
2654   }
2655   return true;
2656 }
2657
2658 bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
       // Node-based entry point: fetch the node's mask into a local vector and
       // apply the file-local mask predicate of the same name (hence the
       // leading ::). Note that hasSSSE3 is passed as a constant true here, so
       // this entry point performs no subtarget feature check of its own.
2659   SmallVector<int, 8> M;
2660   N->getMask(M);
2661   return ::isPALIGNRMask(M, N->getValueType(0), true);
2662 }
2663
2664 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2665 /// specifies a shuffle of elements that is suitable for input to SHUFP*.
     /// Only 2- and 4-element vectors are accepted. Each entry in the lower half
     /// of the mask must be undef or in [0, NumElems); each entry in the upper
     /// half must be undef or in [NumElems, 2*NumElems).
2666 static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2667   int NumElems = VT.getVectorNumElements();
2668   if (NumElems != 2 && NumElems != 4)
2669     return false;
2670
2671   int Half = NumElems / 2;
       // Lower half of the result: values that index the first operand.
2672   for (int i = 0; i < Half; ++i)
2673     if (!isUndefOrInRange(Mask[i], 0, NumElems))
2674       return false;
       // Upper half of the result: values that index the second operand.
2675   for (int i = Half; i < NumElems; ++i)
2676     if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2677       return false;
2678
2679   return true;
2680 }
2681
2682 bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
       // Node-based entry point: fetch the node's mask into a local vector and
       // apply the file-local mask predicate of the same name (hence the
       // leading ::) together with the node's result type.
2683   SmallVector<int, 8> M;
2684   N->getMask(M);
2685   return ::isSHUFPMask(M, N->getValueType(0));
2686 }
2687
2688 /// isCommutedSHUFPMask - Returns true if the shuffle mask is exactly
2689 /// the reverse of what x86 shuffles want. x86 shuffles requires the lower
2690 /// half elements to come from vector 1 (which would equal the dest.) and
2691 /// the upper half to come from vector 2.
     /// So here the lower half entries must be undef or in [NumElems, 2*NumElems)
     /// and the upper half entries must be undef or in [0, NumElems). Only 2- and
     /// 4-element vectors are accepted.
2692 static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2693   int NumElems = VT.getVectorNumElements();
2694
2695   if (NumElems != 2 && NumElems != 4)
2696     return false;
2697
2698   int Half = NumElems / 2;
       // Lower half of the result: values that index the second operand.
2699   for (int i = 0; i < Half; ++i)
2700     if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2701       return false;
       // Upper half of the result: values that index the first operand.
2702   for (int i = Half; i < NumElems; ++i)
2703     if (!isUndefOrInRange(Mask[i], 0, NumElems))
2704       return false;
2705   return true;
2706 }
2707
2708 static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
       // Node-based form of isCommutedSHUFPMask: fetch the node's mask into a
       // local vector and test it together with the node's result type.
2709   SmallVector<int, 8> M;
2710   N->getMask(M);
2711   return isCommutedSHUFPMask(M, N->getValueType(0));
2712 }
2713
2714 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2715 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
     /// Only 4-element vectors are accepted, and each mask element may be undef
     /// in place of the expected value.
2716 bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2717   if (N->getValueType(0).getVectorNumElements() != 4)
2718     return false;
2719
2720   // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3
2721   return isUndefOrEqual(N->getMaskElt(0), 6) &&
2722          isUndefOrEqual(N->getMaskElt(1), 7) &&
2723          isUndefOrEqual(N->getMaskElt(2), 2) &&
2724          isUndefOrEqual(N->getMaskElt(3), 3);
2725 }
2726
2727 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2728 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2729 /// <2, 3, 2, 3>
2730 bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2731   unsigned NumElems = N->getValueType(0).getVectorNumElements();
2732
2733   if (NumElems != 4)
2734     return false;
2735
2736   return isUndefOrEqual(N->getMaskElt(0), 2) &&
2737   isUndefOrEqual(N->getMaskElt(1), 3) &&
2738   isUndefOrEqual(N->getMaskElt(2), 2) &&
2739   isUndefOrEqual(N->getMaskElt(3), 3);
2740 }
2741
2742 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2743 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2744 bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2745   unsigned NumElems = N->getValueType(0).getVectorNumElements();
2746
2747   if (NumElems != 2 && NumElems != 4)
2748     return false;
2749
2750   for (unsigned i = 0; i < NumElems/2; ++i)
2751     if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2752       return false;
2753
2754   for (unsigned i = NumElems/2; i < NumElems; ++i)
2755     if (!isUndefOrEqual(N->getMaskElt(i), i))
2756       return false;
2757
2758   return true;
2759 }
2760
2761 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
2762 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
2763 bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
2764   unsigned NumElems = N->getValueType(0).getVectorNumElements();
2765
2766   if (NumElems != 2 && NumElems != 4)
2767     return false;
2768
2769   for (unsigned i = 0; i < NumElems/2; ++i)
2770     if (!isUndefOrEqual(N->getMaskElt(i), i))
2771       return false;
2772
2773   for (unsigned i = 0; i < NumElems/2; ++i)
2774     if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2775       return false;
2776
2777   return true;
2778 }
2779
2780 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2781 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
2782 static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2783                          bool V2IsSplat = false) {
2784   int NumElts = VT.getVectorNumElements();
2785   if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2786     return false;
2787
2788   for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2789     int BitI  = Mask[i];
2790     int BitI1 = Mask[i+1];
2791     if (!isUndefOrEqual(BitI, j))
2792       return false;
2793     if (V2IsSplat) {
2794       if (!isUndefOrEqual(BitI1, NumElts))
2795         return false;
2796     } else {
2797       if (!isUndefOrEqual(BitI1, j + NumElts))
2798         return false;
2799     }
2800   }
2801   return true;
2802 }
2803
2804 bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2805   SmallVector<int, 8> M;
2806   N->getMask(M);
2807   return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2808 }
2809
2810 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2811 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
2812 static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2813                          bool V2IsSplat = false) {
2814   int NumElts = VT.getVectorNumElements();
2815   if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2816     return false;
2817
2818   for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2819     int BitI  = Mask[i];
2820     int BitI1 = Mask[i+1];
2821     if (!isUndefOrEqual(BitI, j + NumElts/2))
2822       return false;
2823     if (V2IsSplat) {
2824       if (isUndefOrEqual(BitI1, NumElts))
2825         return false;
2826     } else {
2827       if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2828         return false;
2829     }
2830   }
2831   return true;
2832 }
2833
2834 bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2835   SmallVector<int, 8> M;
2836   N->getMask(M);
2837   return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2838 }
2839
2840 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2841 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2842 /// <0, 0, 1, 1>
2843 static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2844   int NumElems = VT.getVectorNumElements();
2845   if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2846     return false;
2847
2848   for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2849     int BitI  = Mask[i];
2850     int BitI1 = Mask[i+1];
2851     if (!isUndefOrEqual(BitI, j))
2852       return false;
2853     if (!isUndefOrEqual(BitI1, j))
2854       return false;
2855   }
2856   return true;
2857 }
2858
2859 bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2860   SmallVector<int, 8> M;
2861   N->getMask(M);
2862   return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2863 }
2864
2865 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2866 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2867 /// <2, 2, 3, 3>
2868 static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2869   int NumElems = VT.getVectorNumElements();
2870   if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2871     return false;
2872
2873   for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2874     int BitI  = Mask[i];
2875     int BitI1 = Mask[i+1];
2876     if (!isUndefOrEqual(BitI, j))
2877       return false;
2878     if (!isUndefOrEqual(BitI1, j))
2879       return false;
2880   }
2881   return true;
2882 }
2883
2884 bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2885   SmallVector<int, 8> M;
2886   N->getMask(M);
2887   return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2888 }
2889
2890 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2891 /// specifies a shuffle of elements that is suitable for input to MOVSS,
2892 /// MOVSD, and MOVD, i.e. setting the lowest element.
2893 static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2894   if (VT.getVectorElementType().getSizeInBits() < 32)
2895     return false;
2896
2897   int NumElts = VT.getVectorNumElements();
2898
2899   if (!isUndefOrEqual(Mask[0], NumElts))
2900     return false;
2901
2902   for (int i = 1; i < NumElts; ++i)
2903     if (!isUndefOrEqual(Mask[i], i))
2904       return false;
2905
2906   return true;
2907 }
2908
2909 bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2910   SmallVector<int, 8> M;
2911   N->getMask(M);
2912   return ::isMOVLMask(M, N->getValueType(0));
2913 }
2914
2915 /// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
2916 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
2917 /// element of vector 2 and the other elements to come from vector 1 in order.
2918 static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2919                                bool V2IsSplat = false, bool V2IsUndef = false) {
2920   int NumOps = VT.getVectorNumElements();
2921   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2922     return false;
2923
2924   if (!isUndefOrEqual(Mask[0], 0))
2925     return false;
2926
2927   for (int i = 1; i < NumOps; ++i)
2928     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
2929           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
2930           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
2931       return false;
2932
2933   return true;
2934 }
2935
2936 static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
2937                            bool V2IsUndef = false) {
2938   SmallVector<int, 8> M;
2939   N->getMask(M);
2940   return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
2941 }
2942
2943 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2944 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2945 bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
2946   if (N->getValueType(0).getVectorNumElements() != 4)
2947     return false;
2948
2949   // Expect 1, 1, 3, 3
2950   for (unsigned i = 0; i < 2; ++i) {
2951     int Elt = N->getMaskElt(i);
2952     if (Elt >= 0 && Elt != 1)
2953       return false;
2954   }
2955
2956   bool HasHi = false;
2957   for (unsigned i = 2; i < 4; ++i) {
2958     int Elt = N->getMaskElt(i);
2959     if (Elt >= 0 && Elt != 3)
2960       return false;
2961     if (Elt == 3)
2962       HasHi = true;
2963   }
2964   // Don't use movshdup if it can be done with a shufps.
2965   // FIXME: verify that matching u, u, 3, 3 is what we want.
2966   return HasHi;
2967 }
2968
2969 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2970 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
2971 bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
2972   if (N->getValueType(0).getVectorNumElements() != 4)
2973     return false;
2974
2975   // Expect 0, 0, 2, 2
2976   for (unsigned i = 0; i < 2; ++i)
2977     if (N->getMaskElt(i) > 0)
2978       return false;
2979
2980   bool HasHi = false;
2981   for (unsigned i = 2; i < 4; ++i) {
2982     int Elt = N->getMaskElt(i);
2983     if (Elt >= 0 && Elt != 2)
2984       return false;
2985     if (Elt == 2)
2986       HasHi = true;
2987   }
2988   // Don't use movsldup if it can be done with a shufps.
2989   return HasHi;
2990 }
2991
2992 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2993 /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
2994 bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
2995   int e = N->getValueType(0).getVectorNumElements() / 2;
2996
2997   for (int i = 0; i < e; ++i)
2998     if (!isUndefOrEqual(N->getMaskElt(i), i))
2999       return false;
3000   for (int i = 0; i < e; ++i)
3001     if (!isUndefOrEqual(N->getMaskElt(e+i), i))
3002       return false;
3003   return true;
3004 }
3005
3006 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3007 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
3008 unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
3009   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3010   int NumOperands = SVOp->getValueType(0).getVectorNumElements();
3011
3012   unsigned Shift = (NumOperands == 4) ? 2 : 1;
3013   unsigned Mask = 0;
3014   for (int i = 0; i < NumOperands; ++i) {
3015     int Val = SVOp->getMaskElt(NumOperands-i-1);
3016     if (Val < 0) Val = 0;
3017     if (Val >= NumOperands) Val -= NumOperands;
3018     Mask |= Val;
3019     if (i != NumOperands - 1)
3020       Mask <<= Shift;
3021   }
3022   return Mask;
3023 }
3024
3025 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3026 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
3027 unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
3028   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3029   unsigned Mask = 0;
3030   // 8 nodes, but we only care about the last 4.
3031   for (unsigned i = 7; i >= 4; --i) {
3032     int Val = SVOp->getMaskElt(i);
3033     if (Val >= 0)
3034       Mask |= (Val - 4);
3035     if (i != 4)
3036       Mask <<= 2;
3037   }
3038   return Mask;
3039 }
3040
3041 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
3042 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
3043 unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
3044   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3045   unsigned Mask = 0;
3046   // 8 nodes, but we only care about the first 4.
3047   for (int i = 3; i >= 0; --i) {
3048     int Val = SVOp->getMaskElt(i);
3049     if (Val >= 0)
3050       Mask |= Val;
3051     if (i != 0)
3052       Mask <<= 2;
3053   }
3054   return Mask;
3055 }
3056
3057 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
3058 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
3059 unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
3060   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3061   EVT VVT = N->getValueType(0);
3062   unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
3063   int Val = 0;
3064
3065   unsigned i, e;
3066   for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
3067     Val = SVOp->getMaskElt(i);
3068     if (Val >= 0)
3069       break;
3070   }
3071   return (Val - i) * EltSize;
3072 }
3073
3074 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
3075 /// constant +0.0.
3076 bool X86::isZeroNode(SDValue Elt) {
3077   return ((isa<ConstantSDNode>(Elt) &&
3078            cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
3079           (isa<ConstantFPSDNode>(Elt) &&
3080            cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
3081 }
3082
3083 /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
3084 /// their permute mask.
3085 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
3086                                     SelectionDAG &DAG) {
3087   EVT VT = SVOp->getValueType(0);
3088   unsigned NumElems = VT.getVectorNumElements();
3089   SmallVector<int, 8> MaskVec;
3090
3091   for (unsigned i = 0; i != NumElems; ++i) {
3092     int idx = SVOp->getMaskElt(i);
3093     if (idx < 0)
3094       MaskVec.push_back(idx);
3095     else if (idx < (int)NumElems)
3096       MaskVec.push_back(idx + NumElems);
3097     else
3098       MaskVec.push_back(idx - NumElems);
3099   }
3100   return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
3101                               SVOp->getOperand(0), &MaskVec[0]);
3102 }
3103
3104 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3105 /// the two vector operands have swapped position.
3106 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3107   unsigned NumElems = VT.getVectorNumElements();
3108   for (unsigned i = 0; i != NumElems; ++i) {
3109     int idx = Mask[i];
3110     if (idx < 0)
3111       continue;
3112     else if (idx < (int)NumElems)
3113       Mask[i] = idx + NumElems;
3114     else
3115       Mask[i] = idx - NumElems;
3116   }
3117 }
3118
3119 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3120 /// match movhlps. The lower half elements should come from upper half of
3121 /// V1 (and in order), and the upper half elements should come from the upper
3122 /// half of V2 (and in order).
3123 static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3124   if (Op->getValueType(0).getVectorNumElements() != 4)
3125     return false;
3126   for (unsigned i = 0, e = 2; i != e; ++i)
3127     if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3128       return false;
3129   for (unsigned i = 2; i != 4; ++i)
3130     if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3131       return false;
3132   return true;
3133 }
3134
3135 /// isScalarLoadToVector - Returns true if the node is a scalar load that
3136 /// is promoted to a vector. It also returns the LoadSDNode by reference if
3137 /// required.
3138 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3139   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3140     return false;
3141   N = N->getOperand(0).getNode();
3142   if (!ISD::isNON_EXTLoad(N))
3143     return false;
3144   if (LD)
3145     *LD = cast<LoadSDNode>(N);
3146   return true;
3147 }
3148
3149 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3150 /// match movlp{s|d}. The lower half elements should come from lower half of
3151 /// V1 (and in order), and the upper half elements should come from the upper
3152 /// half of V2 (and in order). And since V1 will become the source of the
3153 /// MOVLP, it must be either a vector load or a scalar load to vector.
3154 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3155                                ShuffleVectorSDNode *Op) {
3156   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3157     return false;
3158   // Is V2 is a vector load, don't do this transformation. We will try to use
3159   // load folding shufps op.
3160   if (ISD::isNON_EXTLoad(V2))
3161     return false;
3162
3163   unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3164
3165   if (NumElems != 2 && NumElems != 4)
3166     return false;
3167   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3168     if (!isUndefOrEqual(Op->getMaskElt(i), i))
3169       return false;
3170   for (unsigned i = NumElems/2; i != NumElems; ++i)
3171     if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3172       return false;
3173   return true;
3174 }
3175
3176 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3177 /// all the same.
3178 static bool isSplatVector(SDNode *N) {
3179   if (N->getOpcode() != ISD::BUILD_VECTOR)
3180     return false;
3181
3182   SDValue SplatValue = N->getOperand(0);
3183   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3184     if (N->getOperand(i) != SplatValue)
3185       return false;
3186   return true;
3187 }
3188
3189 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3190 /// to an zero vector.
3191 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3192 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3193   SDValue V1 = N->getOperand(0);
3194   SDValue V2 = N->getOperand(1);
3195   unsigned NumElems = N->getValueType(0).getVectorNumElements();
3196   for (unsigned i = 0; i != NumElems; ++i) {
3197     int Idx = N->getMaskElt(i);
3198     if (Idx >= (int)NumElems) {
3199       unsigned Opc = V2.getOpcode();
3200       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3201         continue;
3202       if (Opc != ISD::BUILD_VECTOR ||
3203           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3204         return false;
3205     } else if (Idx >= 0) {
3206       unsigned Opc = V1.getOpcode();
3207       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3208         continue;
3209       if (Opc != ISD::BUILD_VECTOR ||
3210           !X86::isZeroNode(V1.getOperand(Idx)))
3211         return false;
3212     }
3213   }
3214   return true;
3215 }
3216
3217 /// getZeroVector - Returns a vector of specified type with all zero elements.
3218 ///
3219 static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3220                              DebugLoc dl) {
3221   assert(VT.isVector() && "Expected a vector type");
3222
3223   // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3224   // type.  This ensures they get CSE'd.
3225   SDValue Vec;
3226   if (VT.getSizeInBits() == 64) { // MMX
3227     SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3228     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3229   } else if (HasSSE2) {  // SSE2
3230     SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3231     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3232   } else { // SSE1
3233     SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3234     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3235   }
3236   return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3237 }
3238
3239 /// getOnesVector - Returns a vector of specified type with all bits set.
3240 ///
3241 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3242   assert(VT.isVector() && "Expected a vector type");
3243
3244   // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3245   // type.  This ensures they get CSE'd.
3246   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3247   SDValue Vec;
3248   if (VT.getSizeInBits() == 64)  // MMX
3249     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3250   else                                              // SSE
3251     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3252   return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3253 }
3254
3255
3256 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3257 /// that point to V2 points to its first element.
3258 static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3259   EVT VT = SVOp->getValueType(0);
3260   unsigned NumElems = VT.getVectorNumElements();
3261
3262   bool Changed = false;
3263   SmallVector<int, 8> MaskVec;
3264   SVOp->getMask(MaskVec);
3265
3266   for (unsigned i = 0; i != NumElems; ++i) {
3267     if (MaskVec[i] > (int)NumElems) {
3268       MaskVec[i] = NumElems;
3269       Changed = true;
3270     }
3271   }
3272   if (Changed)
3273     return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3274                                 SVOp->getOperand(1), &MaskVec[0]);
3275   return SDValue(SVOp, 0);
3276 }
3277
3278 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
3279 /// operation of specified width.
3280 static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3281                        SDValue V2) {
3282   unsigned NumElems = VT.getVectorNumElements();
3283   SmallVector<int, 8> Mask;
3284   Mask.push_back(NumElems);
3285   for (unsigned i = 1; i != NumElems; ++i)
3286     Mask.push_back(i);
3287   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3288 }
3289
3290 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3291 static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3292                           SDValue V2) {
3293   unsigned NumElems = VT.getVectorNumElements();
3294   SmallVector<int, 8> Mask;
3295   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3296     Mask.push_back(i);
3297     Mask.push_back(i + NumElems);
3298   }
3299   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3300 }
3301
3302 /// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
3303 static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3304                           SDValue V2) {
3305   unsigned NumElems = VT.getVectorNumElements();
3306   unsigned Half = NumElems/2;
3307   SmallVector<int, 8> Mask;
3308   for (unsigned i = 0; i != Half; ++i) {
3309     Mask.push_back(i + Half);
3310     Mask.push_back(i + NumElems + Half);
3311   }
3312   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3313 }
3314
3315 /// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
3316 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3317                             bool HasSSE2) {
3318   if (SV->getValueType(0).getVectorNumElements() <= 4)
3319     return SDValue(SV, 0);
3320
3321   EVT PVT = MVT::v4f32;
3322   EVT VT = SV->getValueType(0);
3323   DebugLoc dl = SV->getDebugLoc();
3324   SDValue V1 = SV->getOperand(0);
3325   int NumElems = VT.getVectorNumElements();
3326   int EltNo = SV->getSplatIndex();
3327
3328   // unpack elements to the correct location
3329   while (NumElems > 4) {
3330     if (EltNo < NumElems/2) {
3331       V1 = getUnpackl(DAG, dl, VT, V1, V1);
3332     } else {
3333       V1 = getUnpackh(DAG, dl, VT, V1, V1);
3334       EltNo -= NumElems/2;
3335     }
3336     NumElems >>= 1;
3337   }
3338
3339   // Perform the splat.
3340   int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3341   V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3342   V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3343   return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3344 }
3345
3346 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3347 /// vector of zero or undef vector.  This produces a shuffle where the low
3348 /// element of V2 is swizzled into the zero/undef vector, landing at element
3349 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
3350 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3351                                              bool isZero, bool HasSSE2,
3352                                              SelectionDAG &DAG) {
3353   EVT VT = V2.getValueType();
3354   SDValue V1 = isZero
3355     ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3356   unsigned NumElems = VT.getVectorNumElements();
3357   SmallVector<int, 16> MaskVec;
3358   for (unsigned i = 0; i != NumElems; ++i)
3359     // If this is the insertion idx, put the low elt of V2 here.
3360     MaskVec.push_back(i == Idx ? NumElems : i);
3361   return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3362 }
3363
3364 /// getNumOfConsecutiveZeros - Return the number of elements in a result of
3365 /// a shuffle that is zero.
3366 static
3367 unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3368                                   bool Low, SelectionDAG &DAG) {
3369   unsigned NumZeros = 0;
3370   for (int i = 0; i < NumElems; ++i) {
3371     unsigned Index = Low ? i : NumElems-i-1;
3372     int Idx = SVOp->getMaskElt(Index);
3373     if (Idx < 0) {
3374       ++NumZeros;
3375       continue;
3376     }
3377     SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3378     if (Elt.getNode() && X86::isZeroNode(Elt))
3379       ++NumZeros;
3380     else
3381       break;
3382   }
3383   return NumZeros;
3384 }
3385
3386 /// isVectorShift - Returns true if the shuffle can be implemented as a
3387 /// logical left or right shift of a vector.
3388 /// FIXME: split into pslldqi, psrldqi, palignr variants.
3389 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3390                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3391   int NumElems = SVOp->getValueType(0).getVectorNumElements();
3392
3393   isLeft = true;
3394   unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3395   if (!NumZeros) {
3396     isLeft = false;
3397     NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3398     if (!NumZeros)
3399       return false;
3400   }
3401   bool SeenV1 = false;
3402   bool SeenV2 = false;
3403   for (int i = NumZeros; i < NumElems; ++i) {
3404     int Val = isLeft ? (i - NumZeros) : i;
3405     int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3406     if (Idx < 0)
3407       continue;
3408     if (Idx < NumElems)
3409       SeenV1 = true;
3410     else {
3411       Idx -= NumElems;
3412       SeenV2 = true;
3413     }
3414     if (Idx != Val)
3415       return false;
3416   }
3417   if (SeenV1 && SeenV2)
3418     return false;
3419
3420   ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3421   ShAmt = NumZeros;
3422   return true;
3423 }
3424
3425
3426 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3427 ///
3428 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3429                                        unsigned NumNonZero, unsigned NumZero,
3430                                        SelectionDAG &DAG, TargetLowering &TLI) {
3431   if (NumNonZero > 8)
3432     return SDValue();
3433
3434   DebugLoc dl = Op.getDebugLoc();
3435   SDValue V(0, 0);
3436   bool First = true;
3437   for (unsigned i = 0; i < 16; ++i) {
3438     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3439     if (ThisIsNonZero && First) {
3440       if (NumZero)
3441         V = getZeroVector(MVT::v8i16, true, DAG, dl);
3442       else
3443         V = DAG.getUNDEF(MVT::v8i16);
3444       First = false;
3445     }
3446
3447     if ((i & 1) != 0) {
3448       SDValue ThisElt(0, 0), LastElt(0, 0);
3449       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3450       if (LastIsNonZero) {
3451         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3452                               MVT::i16, Op.getOperand(i-1));
3453       }
3454       if (ThisIsNonZero) {
3455         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3456         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3457                               ThisElt, DAG.getConstant(8, MVT::i8));
3458         if (LastIsNonZero)
3459           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3460       } else
3461         ThisElt = LastElt;
3462
3463       if (ThisElt.getNode())
3464         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3465                         DAG.getIntPtrConstant(i/2));
3466     }
3467   }
3468
3469   return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3470 }
3471
3472 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3473 ///
3474 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3475                                        unsigned NumNonZero, unsigned NumZero,
3476                                        SelectionDAG &DAG, TargetLowering &TLI) {
3477   if (NumNonZero > 4)
3478     return SDValue();
3479
3480   DebugLoc dl = Op.getDebugLoc();
3481   SDValue V(0, 0);
3482   bool First = true;
3483   for (unsigned i = 0; i < 8; ++i) {
3484     bool isNonZero = (NonZeros & (1 << i)) != 0;
3485     if (isNonZero) {
3486       if (First) {
3487         if (NumZero)
3488           V = getZeroVector(MVT::v8i16, true, DAG, dl);
3489         else
3490           V = DAG.getUNDEF(MVT::v8i16);
3491         First = false;
3492       }
3493       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3494                       MVT::v8i16, V, Op.getOperand(i),
3495                       DAG.getIntPtrConstant(i));
3496     }
3497   }
3498
3499   return V;
3500 }
3501
3502 /// getVShift - Return a vector logical shift node.
3503 ///
3504 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3505                          unsigned NumBits, SelectionDAG &DAG,
3506                          const TargetLowering &TLI, DebugLoc dl) {
3507   bool isMMX = VT.getSizeInBits() == 64;
3508   EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3509   unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3510   SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3511   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3512                      DAG.getNode(Opc, dl, ShVT, SrcOp,
3513                              DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3514 }
3515
3516 SDValue
3517 X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3518                                           SelectionDAG &DAG) {
3519
3520   // Check if the scalar load can be widened into a vector load. And if
3521   // the address is "base + cst" see if the cst can be "absorbed" into
3522   // the shuffle mask.
3523   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3524     SDValue Ptr = LD->getBasePtr();
3525     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3526       return SDValue();
3527     EVT PVT = LD->getValueType(0);
3528     if (PVT != MVT::i32 && PVT != MVT::f32)
3529       return SDValue();
3530
3531     int FI = -1;
3532     int64_t Offset = 0;
3533     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3534       FI = FINode->getIndex();
3535       Offset = 0;
3536     } else if (Ptr.getOpcode() == ISD::ADD &&
3537                isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3538                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3539       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3540       Offset = Ptr.getConstantOperandVal(1);
3541       Ptr = Ptr.getOperand(0);
3542     } else {
3543       return SDValue();
3544     }
3545
3546     SDValue Chain = LD->getChain();
3547     // Make sure the stack object alignment is at least 16.
3548     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3549     if (DAG.InferPtrAlignment(Ptr) < 16) {
3550       if (MFI->isFixedObjectIndex(FI)) {
3551         // Can't change the alignment. FIXME: It's possible to compute
3552         // the exact stack offset and reference FI + adjust offset instead.
3553         // If someone *really* cares about this. That's the way to implement it.
3554         return SDValue();
3555       } else {
3556         MFI->setObjectAlignment(FI, 16);
3557       }
3558     }
3559
3560     // (Offset % 16) must be multiple of 4. Then address is then
3561     // Ptr + (Offset & ~15).
3562     if (Offset < 0)
3563       return SDValue();
3564     if ((Offset % 16) & 3)
3565       return SDValue();
3566     int64_t StartOffset = Offset & ~15;
3567     if (StartOffset)
3568       Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
3569                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
3570
3571     int EltNo = (Offset - StartOffset) >> 2;
3572     int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
3573     EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
3574     SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0,
3575                              false, false, 0);
3576     // Canonicalize it to a v4i32 shuffle.
3577     V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
3578     return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3579                        DAG.getVectorShuffle(MVT::v4i32, dl, V1,
3580                                             DAG.getUNDEF(MVT::v4i32), &Mask[0]));
3581   }
3582
3583   return SDValue();
3584 }
3585
3586 SDValue
3587 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3588   DebugLoc dl = Op.getDebugLoc();
3589   // All zero's are handled with pxor, all one's are handled with pcmpeqd.
3590   if (ISD::isBuildVectorAllZeros(Op.getNode())
3591       || ISD::isBuildVectorAllOnes(Op.getNode())) {
3592     // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3593     // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3594     // eliminated on x86-32 hosts.
3595     if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3596       return Op;
3597
3598     if (ISD::isBuildVectorAllOnes(Op.getNode()))
3599       return getOnesVector(Op.getValueType(), DAG, dl);
3600     return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3601   }
3602
3603   EVT VT = Op.getValueType();
3604   EVT ExtVT = VT.getVectorElementType();
3605   unsigned EVTBits = ExtVT.getSizeInBits();
3606
3607   unsigned NumElems = Op.getNumOperands();
3608   unsigned NumZero  = 0;
3609   unsigned NumNonZero = 0;
3610   unsigned NonZeros = 0;
3611   bool IsAllConstants = true;
3612   SmallSet<SDValue, 8> Values;
3613   for (unsigned i = 0; i < NumElems; ++i) {
3614     SDValue Elt = Op.getOperand(i);
3615     if (Elt.getOpcode() == ISD::UNDEF)
3616       continue;
3617     Values.insert(Elt);
3618     if (Elt.getOpcode() != ISD::Constant &&
3619         Elt.getOpcode() != ISD::ConstantFP)
3620       IsAllConstants = false;
3621     if (X86::isZeroNode(Elt))
3622       NumZero++;
3623     else {
3624       NonZeros |= (1 << i);
3625       NumNonZero++;
3626     }
3627   }
3628
3629   if (NumNonZero == 0) {
3630     // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3631     return DAG.getUNDEF(VT);
3632   }
3633
3634   // Special case for single non-zero, non-undef, element.
3635   if (NumNonZero == 1) {
3636     unsigned Idx = CountTrailingZeros_32(NonZeros);
3637     SDValue Item = Op.getOperand(Idx);
3638
3639     // If this is an insertion of an i64 value on x86-32, and if the top bits of
3640     // the value are obviously zero, truncate the value to i32 and do the
3641     // insertion that way.  Only do this if the value is non-constant or if the
3642     // value is a constant being inserted into element 0.  It is cheaper to do
3643     // a constant pool load than it is to do a movd + shuffle.
3644     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3645         (!IsAllConstants || Idx == 0)) {
3646       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3647         // Handle MMX and SSE both.
3648         EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3649         unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3650
3651         // Truncate the value (which may itself be a constant) to i32, and
3652         // convert it to a vector with movd (S2V+shuffle to zero extend).
3653         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3654         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3655         Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3656                                            Subtarget->hasSSE2(), DAG);
3657
3658         // Now we have our 32-bit value zero extended in the low element of
3659         // a vector.  If Idx != 0, swizzle it into place.
3660         if (Idx != 0) {
3661           SmallVector<int, 4> Mask;
3662           Mask.push_back(Idx);
3663           for (unsigned i = 1; i != VecElts; ++i)
3664             Mask.push_back(i);
3665           Item = DAG.getVectorShuffle(VecVT, dl, Item,
3666                                       DAG.getUNDEF(Item.getValueType()),
3667                                       &Mask[0]);
3668         }
3669         return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3670       }
3671     }
3672
3673     // If we have a constant or non-constant insertion into the low element of
3674     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3675     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3676     // depending on what the source datatype is.
3677     if (Idx == 0) {
3678       if (NumZero == 0) {
3679         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3680       } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
3681           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
3682         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3683         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3684         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3685                                            DAG);
3686       } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
3687         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3688         EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3689         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3690         Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3691                                            Subtarget->hasSSE2(), DAG);
3692         return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3693       }
3694     }
3695
3696     // Is it a vector logical left shift?
3697     if (NumElems == 2 && Idx == 1 &&
3698         X86::isZeroNode(Op.getOperand(0)) &&
3699         !X86::isZeroNode(Op.getOperand(1))) {
3700       unsigned NumBits = VT.getSizeInBits();
3701       return getVShift(true, VT,
3702                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3703                                    VT, Op.getOperand(1)),
3704                        NumBits/2, DAG, *this, dl);
3705     }
3706
3707     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3708       return SDValue();
3709
3710     // Otherwise, if this is a vector with i32 or f32 elements, and the element
3711     // is a non-constant being inserted into an element other than the low one,
3712     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3713     // movd/movss) to move this into the low element, then shuffle it into
3714     // place.
3715     if (EVTBits == 32) {
3716       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3717
3718       // Turn it into a shuffle of zero and zero-extended scalar to vector.
3719       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3720                                          Subtarget->hasSSE2(), DAG);
3721       SmallVector<int, 8> MaskVec;
3722       for (unsigned i = 0; i < NumElems; i++)
3723         MaskVec.push_back(i == Idx ? 0 : 1);
3724       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3725     }
3726   }
3727
3728   // Splat is obviously ok. Let legalizer expand it to a shuffle.
3729   if (Values.size() == 1) {
3730     if (EVTBits == 32) {
3731       // Instead of a shuffle like this:
3732       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
3733       // Check if it's possible to issue this instead.
3734       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
3735       unsigned Idx = CountTrailingZeros_32(NonZeros);
3736       SDValue Item = Op.getOperand(Idx);
3737       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
3738         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
3739     }
3740     return SDValue();
3741   }
3742
3743   // A vector full of immediates; various special cases are already
3744   // handled, so this is best done with a single constant-pool load.
3745   if (IsAllConstants)
3746     return SDValue();
3747
3748   // Let legalizer expand 2-wide build_vectors.
3749   if (EVTBits == 64) {
3750     if (NumNonZero == 1) {
3751       // One half is zero or undef.
3752       unsigned Idx = CountTrailingZeros_32(NonZeros);
3753       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3754                                  Op.getOperand(Idx));
3755       return getShuffleVectorZeroOrUndef(V2, Idx, true,
3756                                          Subtarget->hasSSE2(), DAG);
3757     }
3758     return SDValue();
3759   }
3760
3761   // If element VT is < 32 bits, convert it to inserts into a zero vector.
3762   if (EVTBits == 8 && NumElems == 16) {
3763     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
3764                                         *this);
3765     if (V.getNode()) return V;
3766   }
3767
3768   if (EVTBits == 16 && NumElems == 8) {
3769     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
3770                                         *this);
3771     if (V.getNode()) return V;
3772   }
3773
3774   // If element VT is == 32 bits, turn it into a number of shuffles.
3775   SmallVector<SDValue, 8> V;
3776   V.resize(NumElems);
3777   if (NumElems == 4 && NumZero > 0) {
3778     for (unsigned i = 0; i < 4; ++i) {
3779       bool isZero = !(NonZeros & (1 << i));
3780       if (isZero)
3781         V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3782       else
3783         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3784     }
3785
3786     for (unsigned i = 0; i < 2; ++i) {
3787       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3788         default: break;
3789         case 0:
3790           V[i] = V[i*2];  // Must be a zero vector.
3791           break;
3792         case 1:
3793           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3794           break;
3795         case 2:
3796           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3797           break;
3798         case 3:
3799           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3800           break;
3801       }
3802     }
3803
3804     SmallVector<int, 8> MaskVec;
3805     bool Reverse = (NonZeros & 0x3) == 2;
3806     for (unsigned i = 0; i < 2; ++i)
3807       MaskVec.push_back(Reverse ? 1-i : i);
3808     Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3809     for (unsigned i = 0; i < 2; ++i)
3810       MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
3811     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
3812   }
3813
3814   if (Values.size() > 2) {
3815     // If we have SSE 4.1, Expand into a number of inserts unless the number of
3816     // values to be inserted is equal to the number of elements, in which case
3817     // use the unpack code below in the hopes of matching the consecutive elts
3818     // load merge pattern for shuffles.
3819     // FIXME: We could probably just check that here directly.
3820     if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
3821         getSubtarget()->hasSSE41()) {
3822       V[0] = DAG.getUNDEF(VT);
3823       for (unsigned i = 0; i < NumElems; ++i)
3824         if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
3825           V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
3826                              Op.getOperand(i), DAG.getIntPtrConstant(i));
3827       return V[0];
3828     }
3829     // Expand into a number of unpckl*.
3830     // e.g. for v4f32
3831     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3832     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3833     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
3834     for (unsigned i = 0; i < NumElems; ++i)
3835       V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3836     NumElems >>= 1;
3837     while (NumElems != 0) {
3838       for (unsigned i = 0; i < NumElems; ++i)
3839         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3840       NumElems >>= 1;
3841     }
3842     return V[0];
3843   }
3844
3845   return SDValue();
3846 }
3847
3848 SDValue
3849 X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
3850   // We support concatenate two MMX registers and place them in a MMX
3851   // register.  This is better than doing a stack convert.
3852   DebugLoc dl = Op.getDebugLoc();
3853   EVT ResVT = Op.getValueType();
3854   assert(Op.getNumOperands() == 2);
3855   assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
3856          ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
3857   int Mask[2];
3858   SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
3859   SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
3860   InVec = Op.getOperand(1);
3861   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
3862     unsigned NumElts = ResVT.getVectorNumElements();
3863     VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
3864     VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
3865                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
3866   } else {
3867     InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
3868     SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
3869     Mask[0] = 0; Mask[1] = 2;
3870     VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
3871   }
3872   return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
3873 }
3874
3875 // v8i16 shuffles - Prefer shuffles in the following order:
3876 // 1. [all]   pshuflw, pshufhw, optional move
3877 // 2. [ssse3] 1 x pshufb
3878 // 3. [ssse3] 2 x pshufb + 1 x por
3879 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
3880 static
3881 SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
3882                                  SelectionDAG &DAG, X86TargetLowering &TLI) {
3883   SDValue V1 = SVOp->getOperand(0);
3884   SDValue V2 = SVOp->getOperand(1);
3885   DebugLoc dl = SVOp->getDebugLoc();
3886   SmallVector<int, 8> MaskVals;
3887
3888   // Determine if more than 1 of the words in each of the low and high quadwords
3889   // of the result come from the same quadword of one of the two inputs.  Undef
3890   // mask values count as coming from any quadword, for better codegen.
3891   SmallVector<unsigned, 4> LoQuad(4);
3892   SmallVector<unsigned, 4> HiQuad(4);
3893   BitVector InputQuads(4);
3894   for (unsigned i = 0; i < 8; ++i) {
3895     SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
3896     int EltIdx = SVOp->getMaskElt(i);
3897     MaskVals.push_back(EltIdx);
3898     if (EltIdx < 0) {
3899       ++Quad[0];
3900       ++Quad[1];
3901       ++Quad[2];
3902       ++Quad[3];
3903       continue;
3904     }
3905     ++Quad[EltIdx / 4];
3906     InputQuads.set(EltIdx / 4);
3907   }
3908
3909   int BestLoQuad = -1;
3910   unsigned MaxQuad = 1;
3911   for (unsigned i = 0; i < 4; ++i) {
3912     if (LoQuad[i] > MaxQuad) {
3913       BestLoQuad = i;
3914       MaxQuad = LoQuad[i];
3915     }
3916   }
3917
3918   int BestHiQuad = -1;
3919   MaxQuad = 1;
3920   for (unsigned i = 0; i < 4; ++i) {
3921     if (HiQuad[i] > MaxQuad) {
3922       BestHiQuad = i;
3923       MaxQuad = HiQuad[i];
3924     }
3925   }
3926
3927   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
3928   // of the two input vectors, shuffle them into one input vector so only a
3929   // single pshufb instruction is necessary. If There are more than 2 input
3930   // quads, disable the next transformation since it does not help SSSE3.
3931   bool V1Used = InputQuads[0] || InputQuads[1];
3932   bool V2Used = InputQuads[2] || InputQuads[3];
3933   if (TLI.getSubtarget()->hasSSSE3()) {
3934     if (InputQuads.count() == 2 && V1Used && V2Used) {
3935       BestLoQuad = InputQuads.find_first();
3936       BestHiQuad = InputQuads.find_next(BestLoQuad);
3937     }
3938     if (InputQuads.count() > 2) {
3939       BestLoQuad = -1;
3940       BestHiQuad = -1;
3941     }
3942   }
3943
3944   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
3945   // the shuffle mask.  If a quad is scored as -1, that means that it contains
3946   // words from all 4 input quadwords.
3947   SDValue NewV;
3948   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
3949     SmallVector<int, 8> MaskV;
3950     MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
3951     MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
3952     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
3953                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
3954                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
3955     NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
3956
3957     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
3958     // source words for the shuffle, to aid later transformations.
3959     bool AllWordsInNewV = true;
3960     bool InOrder[2] = { true, true };
3961     for (unsigned i = 0; i != 8; ++i) {
3962       int idx = MaskVals[i];
3963       if (idx != (int)i)
3964         InOrder[i/4] = false;
3965       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
3966         continue;
3967       AllWordsInNewV = false;
3968       break;
3969     }
3970
3971     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
3972     if (AllWordsInNewV) {
3973       for (int i = 0; i != 8; ++i) {
3974         int idx = MaskVals[i];
3975         if (idx < 0)
3976           continue;
3977         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
3978         if ((idx != i) && idx < 4)
3979           pshufhw = false;
3980         if ((idx != i) && idx > 3)
3981           pshuflw = false;
3982       }
3983       V1 = NewV;
3984       V2Used = false;
3985       BestLoQuad = 0;
3986       BestHiQuad = 1;
3987     }
3988
3989     // If we've eliminated the use of V2, and the new mask is a pshuflw or
3990     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
3991     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
3992       return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
3993                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
3994     }
3995   }
3996
3997   // If we have SSSE3, and all words of the result are from 1 input vector,
3998   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
3999   // is present, fall back to case 4.
4000   if (TLI.getSubtarget()->hasSSSE3()) {
4001     SmallVector<SDValue,16> pshufbMask;
4002
4003     // If we have elements from both input vectors, set the high bit of the
4004     // shuffle mask element to zero out elements that come from V2 in the V1
4005     // mask, and elements that come from V1 in the V2 mask, so that the two
4006     // results can be OR'd together.
4007     bool TwoInputs = V1Used && V2Used;
4008     for (unsigned i = 0; i != 8; ++i) {
4009       int EltIdx = MaskVals[i] * 2;
4010       if (TwoInputs && (EltIdx >= 16)) {
4011         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4012         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4013         continue;
4014       }
4015       pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
4016       pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
4017     }
4018     V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
4019     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4020                      DAG.getNode(ISD::BUILD_VECTOR, dl,
4021                                  MVT::v16i8, &pshufbMask[0], 16));
4022     if (!TwoInputs)
4023       return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4024
4025     // Calculate the shuffle mask for the second input, shuffle it, and
4026     // OR it with the first shuffled input.
4027     pshufbMask.clear();
4028     for (unsigned i = 0; i != 8; ++i) {
4029       int EltIdx = MaskVals[i] * 2;
4030       if (EltIdx < 16) {
4031         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4032         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4033         continue;
4034       }
4035       pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4036       pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
4037     }
4038     V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
4039     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4040                      DAG.getNode(ISD::BUILD_VECTOR, dl,
4041                                  MVT::v16i8, &pshufbMask[0], 16));
4042     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4043     return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4044   }
4045
4046   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
4047   // and update MaskVals with new element order.
4048   BitVector InOrder(8);
4049   if (BestLoQuad >= 0) {
4050     SmallVector<int, 8> MaskV;
4051     for (int i = 0; i != 4; ++i) {
4052       int idx = MaskVals[i];
4053       if (idx < 0) {
4054         MaskV.push_back(-1);
4055         InOrder.set(i);
4056       } else if ((idx / 4) == BestLoQuad) {
4057         MaskV.push_back(idx & 3);
4058         InOrder.set(i);
4059       } else {
4060         MaskV.push_back(-1);
4061       }
4062     }
4063     for (unsigned i = 4; i != 8; ++i)
4064       MaskV.push_back(i);
4065     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4066                                 &MaskV[0]);
4067   }
4068
4069   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
4070   // and update MaskVals with the new element order.
4071   if (BestHiQuad >= 0) {
4072     SmallVector<int, 8> MaskV;
4073     for (unsigned i = 0; i != 4; ++i)
4074       MaskV.push_back(i);
4075     for (unsigned i = 4; i != 8; ++i) {
4076       int idx = MaskVals[i];
4077       if (idx < 0) {
4078         MaskV.push_back(-1);
4079         InOrder.set(i);
4080       } else if ((idx / 4) == BestHiQuad) {
4081         MaskV.push_back((idx & 3) + 4);
4082         InOrder.set(i);
4083       } else {
4084         MaskV.push_back(-1);
4085       }
4086     }
4087     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4088                                 &MaskV[0]);
4089   }
4090
4091   // In case BestHi & BestLo were both -1, which means each quadword has a word
4092   // from each of the four input quadwords, calculate the InOrder bitvector now
4093   // before falling through to the insert/extract cleanup.
4094   if (BestLoQuad == -1 && BestHiQuad == -1) {
4095     NewV = V1;
4096     for (int i = 0; i != 8; ++i)
4097       if (MaskVals[i] < 0 || MaskVals[i] == i)
4098         InOrder.set(i);
4099   }
4100
4101   // The other elements are put in the right place using pextrw and pinsrw.
4102   for (unsigned i = 0; i != 8; ++i) {
4103     if (InOrder[i])
4104       continue;
4105     int EltIdx = MaskVals[i];
4106     if (EltIdx < 0)
4107       continue;
4108     SDValue ExtOp = (EltIdx < 8)
4109     ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
4110                   DAG.getIntPtrConstant(EltIdx))
4111     : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
4112                   DAG.getIntPtrConstant(EltIdx - 8));
4113     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
4114                        DAG.getIntPtrConstant(i));
4115   }
4116   return NewV;
4117 }
4118
4119 // v16i8 shuffles - Prefer shuffles in the following order:
4120 // 1. [ssse3] 1 x pshufb
4121 // 2. [ssse3] 2 x pshufb + 1 x por
4122 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
4123 static
4124 SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
4125                                  SelectionDAG &DAG, X86TargetLowering &TLI) {
4126   SDValue V1 = SVOp->getOperand(0);
4127   SDValue V2 = SVOp->getOperand(1);
4128   DebugLoc dl = SVOp->getDebugLoc();
4129   SmallVector<int, 16> MaskVals;
4130   SVOp->getMask(MaskVals);
4131
4132   // If we have SSSE3, case 1 is generated when all result bytes come from
4133   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
4134   // present, fall back to case 3.
4135   // FIXME: kill V2Only once shuffles are canonizalized by getNode.
4136   bool V1Only = true;
4137   bool V2Only = true;
4138   for (unsigned i = 0; i < 16; ++i) {
4139     int EltIdx = MaskVals[i];
4140     if (EltIdx < 0)
4141       continue;
4142     if (EltIdx < 16)
4143       V2Only = false;
4144     else
4145       V1Only = false;
4146   }
4147
4148   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
4149   if (TLI.getSubtarget()->hasSSSE3()) {
4150     SmallVector<SDValue,16> pshufbMask;
4151
4152     // If all result elements are from one input vector, then only translate
4153     // undef mask values to 0x80 (zero out result) in the pshufb mask.
4154     //
4155     // Otherwise, we have elements from both input vectors, and must zero out
4156     // elements that come from V2 in the first mask, and V1 in the second mask
4157     // so that we can OR them together.
4158     bool TwoInputs = !(V1Only || V2Only);
4159     for (unsigned i = 0; i != 16; ++i) {
4160       int EltIdx = MaskVals[i];
4161       if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4162         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4163         continue;
4164       }
4165       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4166     }
4167     // If all the elements are from V2, assign it to V1 and return after
4168     // building the first pshufb.
4169     if (V2Only)
4170       V1 = V2;
4171     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4172                      DAG.getNode(ISD::BUILD_VECTOR, dl,
4173                                  MVT::v16i8, &pshufbMask[0], 16));
4174     if (!TwoInputs)
4175       return V1;
4176
4177     // Calculate the shuffle mask for the second input, shuffle it, and
4178     // OR it with the first shuffled input.
4179     pshufbMask.clear();
4180     for (unsigned i = 0; i != 16; ++i) {
4181       int EltIdx = MaskVals[i];
4182       if (EltIdx < 16) {
4183         pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4184         continue;
4185       }
4186       pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4187     }
4188     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4189                      DAG.getNode(ISD::BUILD_VECTOR, dl,
4190                                  MVT::v16i8, &pshufbMask[0], 16));
4191     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4192   }
4193
4194   // No SSSE3 - Calculate in place words and then fix all out of place words
4195   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
4196   // the 16 different words that comprise the two doublequadword input vectors.
4197   V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4198   V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4199   SDValue NewV = V2Only ? V2 : V1;
4200   for (int i = 0; i != 8; ++i) {
4201     int Elt0 = MaskVals[i*2];
4202     int Elt1 = MaskVals[i*2+1];
4203
4204     // This word of the result is all undef, skip it.
4205     if (Elt0 < 0 && Elt1 < 0)
4206       continue;
4207
4208     // This word of the result is already in the correct place, skip it.
4209     if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4210       continue;
4211     if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4212       continue;
4213
4214     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4215     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4216     SDValue InsElt;
4217
4218     // If Elt0 and Elt1 are defined, are consecutive, and can be load
4219     // using a single extract together, load it and store it.
4220     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4221       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4222                            DAG.getIntPtrConstant(Elt1 / 2));
4223       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4224                         DAG.getIntPtrConstant(i));
4225       continue;
4226     }
4227
4228     // If Elt1 is defined, extract it from the appropriate source.  If the
4229     // source byte is not also odd, shift the extracted word left 8 bits
4230     // otherwise clear the bottom 8 bits if we need to do an or.
4231     if (Elt1 >= 0) {
4232       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4233                            DAG.getIntPtrConstant(Elt1 / 2));
4234       if ((Elt1 & 1) == 0)
4235         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4236                              DAG.getConstant(8, TLI.getShiftAmountTy()));
4237       else if (Elt0 >= 0)
4238         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4239                              DAG.getConstant(0xFF00, MVT::i16));
4240     }
4241     // If Elt0 is defined, extract it from the appropriate source.  If the
4242     // source byte is not also even, shift the extracted word right 8 bits. If
4243     // Elt1 was also defined, OR the extracted values together before
4244     // inserting them in the result.
4245     if (Elt0 >= 0) {
4246       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4247                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4248       if ((Elt0 & 1) != 0)
4249         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4250                               DAG.getConstant(8, TLI.getShiftAmountTy()));
4251       else if (Elt1 >= 0)
4252         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4253                              DAG.getConstant(0x00FF, MVT::i16));
4254       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4255                          : InsElt0;
4256     }
4257     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4258                        DAG.getIntPtrConstant(i));
4259   }
4260   return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4261 }
4262
4263 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4264 /// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
4265 /// done when every pair / quad of shuffle mask elements point to elements in
4266 /// the right sequence. e.g.
4267 /// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
4268 static
4269 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4270                                  SelectionDAG &DAG,
4271                                  TargetLowering &TLI, DebugLoc dl) {
4272   EVT VT = SVOp->getValueType(0);
4273   SDValue V1 = SVOp->getOperand(0);
4274   SDValue V2 = SVOp->getOperand(1);
4275   unsigned NumElems = VT.getVectorNumElements();
4276   unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4277   EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4278   EVT MaskEltVT = MaskVT.getVectorElementType();
4279   EVT NewVT = MaskVT;
4280   switch (VT.getSimpleVT().SimpleTy) {
4281   default: assert(false && "Unexpected!");
4282   case MVT::v4f32: NewVT = MVT::v2f64; break;
4283   case MVT::v4i32: NewVT = MVT::v2i64; break;
4284   case MVT::v8i16: NewVT = MVT::v4i32; break;
4285   case MVT::v16i8: NewVT = MVT::v4i32; break;
4286   }
4287
4288   if (NewWidth == 2) {
4289     if (VT.isInteger())
4290       NewVT = MVT::v2i64;
4291     else
4292       NewVT = MVT::v2f64;
4293   }
4294   int Scale = NumElems / NewWidth;
4295   SmallVector<int, 8> MaskVec;
4296   for (unsigned i = 0; i < NumElems; i += Scale) {
4297     int StartIdx = -1;
4298     for (int j = 0; j < Scale; ++j) {
4299       int EltIdx = SVOp->getMaskElt(i+j);
4300       if (EltIdx < 0)
4301         continue;
4302       if (StartIdx == -1)
4303         StartIdx = EltIdx - (EltIdx % Scale);
4304       if (EltIdx != StartIdx + j)
4305         return SDValue();
4306     }
4307     if (StartIdx == -1)
4308       MaskVec.push_back(-1);
4309     else
4310       MaskVec.push_back(StartIdx / Scale);
4311   }
4312
4313   V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4314   V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4315   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4316 }
4317
4318 /// getVZextMovL - Return a zero-extending vector move low node.
4319 ///
4320 static SDValue getVZextMovL(EVT VT, EVT OpVT,
4321                             SDValue SrcOp, SelectionDAG &DAG,
4322                             const X86Subtarget *Subtarget, DebugLoc dl) {
4323   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4324     LoadSDNode *LD = NULL;
4325     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4326       LD = dyn_cast<LoadSDNode>(SrcOp);
4327     if (!LD) {
4328       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4329       // instead.
4330       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4331       if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4332           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4333           SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4334           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4335         // PR2108
4336         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4337         return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4338                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4339                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4340                                                    OpVT,
4341                                                    SrcOp.getOperand(0)
4342                                                           .getOperand(0))));
4343       }
4344     }
4345   }
4346
4347   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4348                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4349                                  DAG.getNode(ISD::BIT_CONVERT, dl,
4350                                              OpVT, SrcOp)));
4351 }
4352
4353 /// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4354 /// shuffles.
4355 static SDValue
4356 LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4357   SDValue V1 = SVOp->getOperand(0);
4358   SDValue V2 = SVOp->getOperand(1);
4359   DebugLoc dl = SVOp->getDebugLoc();
4360   EVT VT = SVOp->getValueType(0);
4361
4362   SmallVector<std::pair<int, int>, 8> Locs;
4363   Locs.resize(4);
4364   SmallVector<int, 8> Mask1(4U, -1);
4365   SmallVector<int, 8> PermMask;
4366   SVOp->getMask(PermMask);
4367
4368   unsigned NumHi = 0;
4369   unsigned NumLo = 0;
4370   for (unsigned i = 0; i != 4; ++i) {
4371     int Idx = PermMask[i];
4372     if (Idx < 0) {
4373       Locs[i] = std::make_pair(-1, -1);
4374     } else {
4375       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4376       if (Idx < 4) {
4377         Locs[i] = std::make_pair(0, NumLo);
4378         Mask1[NumLo] = Idx;
4379         NumLo++;
4380       } else {
4381         Locs[i] = std::make_pair(1, NumHi);
4382         if (2+NumHi < 4)
4383           Mask1[2+NumHi] = Idx;
4384         NumHi++;
4385       }
4386     }
4387   }
4388
4389   if (NumLo <= 2 && NumHi <= 2) {
4390     // If no more than two elements come from either vector. This can be
4391     // implemented with two shuffles. First shuffle gather the elements.
4392     // The second shuffle, which takes the first shuffle as both of its
4393     // vector operands, put the elements into the right order.
4394     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4395
4396     SmallVector<int, 8> Mask2(4U, -1);
4397
4398     for (unsigned i = 0; i != 4; ++i) {
4399       if (Locs[i].first == -1)
4400         continue;
4401       else {
4402         unsigned Idx = (i < 2) ? 0 : 4;
4403         Idx += Locs[i].first * 2 + Locs[i].second;
4404         Mask2[i] = Idx;
4405       }
4406     }
4407
4408     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4409   } else if (NumLo == 3 || NumHi == 3) {
4410     // Otherwise, we must have three elements from one vector, call it X, and
4411     // one element from the other, call it Y.  First, use a shufps to build an
4412     // intermediate vector with the one element from Y and the element from X
4413     // that will be in the same half in the final destination (the indexes don't
4414     // matter). Then, use a shufps to build the final vector, taking the half
4415     // containing the element from Y from the intermediate, and the other half
4416     // from X.
4417     if (NumHi == 3) {
4418       // Normalize it so the 3 elements come from V1.
4419       CommuteVectorShuffleMask(PermMask, VT);
4420       std::swap(V1, V2);
4421     }
4422
4423     // Find the element from V2.
4424     unsigned HiIndex;
4425     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4426       int Val = PermMask[HiIndex];
4427       if (Val < 0)
4428         continue;
4429       if (Val >= 4)
4430         break;
4431     }
4432
4433     Mask1[0] = PermMask[HiIndex];
4434     Mask1[1] = -1;
4435     Mask1[2] = PermMask[HiIndex^1];
4436     Mask1[3] = -1;
4437     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4438
4439     if (HiIndex >= 2) {
4440       Mask1[0] = PermMask[0];
4441       Mask1[1] = PermMask[1];
4442       Mask1[2] = HiIndex & 1 ? 6 : 4;
4443       Mask1[3] = HiIndex & 1 ? 4 : 6;
4444       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4445     } else {
4446       Mask1[0] = HiIndex & 1 ? 2 : 0;
4447       Mask1[1] = HiIndex & 1 ? 0 : 2;
4448       Mask1[2] = PermMask[2];
4449       Mask1[3] = PermMask[3];
4450       if (Mask1[2] >= 0)
4451         Mask1[2] += 4;
4452       if (Mask1[3] >= 0)
4453         Mask1[3] += 4;
4454       return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4455     }
4456   }
4457
4458   // Break it into (shuffle shuffle_hi, shuffle_lo).
4459   Locs.clear();
4460   SmallVector<int,8> LoMask(4U, -1);
4461   SmallVector<int,8> HiMask(4U, -1);
4462
4463   SmallVector<int,8> *MaskPtr = &LoMask;
4464   unsigned MaskIdx = 0;
4465   unsigned LoIdx = 0;
4466   unsigned HiIdx = 2;
4467   for (unsigned i = 0; i != 4; ++i) {
4468     if (i == 2) {
4469       MaskPtr = &HiMask;
4470       MaskIdx = 1;
4471       LoIdx = 0;
4472       HiIdx = 2;
4473     }
4474     int Idx = PermMask[i];
4475     if (Idx < 0) {
4476       Locs[i] = std::make_pair(-1, -1);
4477     } else if (Idx < 4) {
4478       Locs[i] = std::make_pair(MaskIdx, LoIdx);
4479       (*MaskPtr)[LoIdx] = Idx;
4480       LoIdx++;
4481     } else {
4482       Locs[i] = std::make_pair(MaskIdx, HiIdx);
4483       (*MaskPtr)[HiIdx] = Idx;
4484       HiIdx++;
4485     }
4486   }
4487
4488   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4489   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4490   SmallVector<int, 8> MaskOps;
4491   for (unsigned i = 0; i != 4; ++i) {
4492     if (Locs[i].first == -1) {
4493       MaskOps.push_back(-1);
4494     } else {
4495       unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4496       MaskOps.push_back(Idx);
4497     }
4498   }
4499   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4500 }
4501
/// LowerVECTOR_SHUFFLE - Custom lowering for a vector shuffle node.
///
/// The function is a cascade of pattern checks tried in a fixed order; the
/// first one that applies decides the result. Returning Op itself means the
/// shuffle is left as it is; returning a null SDValue means no custom
/// lowering was produced here. The order of the checks, and the point at
/// which the operands are commuted, are significant.
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = VT.getVectorNumElements();
  // 64-bit vector types are treated as MMX throughout this function.
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  // The splat flags are only computed further down, after the checks that do
  // not need them.
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  // Promote splats to v4f32.
  if (SVOp->isSplat()) {
    // MMX splats and splats with fewer than four elements are left alone.
    if (isMMX || NumElems < 4)
      return Op;
    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
    // Lower the narrower shuffle recursively and cast the result back to the
    // original type.
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode()) {
        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  if (X86::isPSHUFDMask(SVOp))
    return Op;

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = getSubtarget()->hasSSE2() &&
    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    EVT EltVT = VT.getVectorElementType();
    // ShAmt is scaled here by the element size in bits.
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(SVOp)) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    // For MMX types, fall through to the remaining checks instead.
    if (!isMMX)
      return Op;
  }

  // FIXME: fold these into legal mask.
  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
                 X86::isMOVSLDUPMask(SVOp) ||
                 X86::isMOVHLPSMask(SVOp) ||
                 X86::isMOVLHPSMask(SVOp) ||
                 X86::isMOVLPMask(SVOp)))
    return Op;

  if (ShouldXformToMOVHLPS(SVOp) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    // (Reached only when the shifted value has more than one use; ShAmt has
    // not been scaled yet on this path.)
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    // From here on Op, SVOp, V1, V2 and the splat / undef flags all describe
    // the commuted shuffle.
    Op = CommuteVectorShuffle(SVOp, DAG);
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = SVOp->getOperand(0);
    V2 = SVOp->getOperand(1);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
      X86::isUNPCKH_v_undef_Mask(SVOp) ||
      X86::isUNPCKLMask(SVOp) ||
      X86::isUNPCKHMask(SVOp))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(SVOp, DAG);
    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
    // Only re-test when normalization produced a different node.
    if (NSVOp != SVOp) {
      if (X86::isUNPCKLMask(NSVOp, true)) {
        return NewMask;
      } else if (X86::isUNPCKHMask(NSVOp, true)) {
        return NewMask;
      }
    }
  }

  if (Commuted) {
    // Commute is back and try unpck* again.
    // FIXME: this seems wrong.
    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKLMask(NewSVOp) ||
        X86::isUNPCKHMask(NewSVOp))
      return NewOp;
  }

  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.

  // Normalize the node to match x86 shuffle ops if needed
  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  // Check for legal shuffle and return?
  SmallVector<int, 16> PermMask;
  SVOp->getMask(PermMask);
  if (isShuffleMaskLegal(PermMask, VT))
    return Op;

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

  // Nothing matched: no custom lowering is produced.
  return SDValue();
}
4687
4688 SDValue
4689 X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4690                                                 SelectionDAG &DAG) {
4691   EVT VT = Op.getValueType();
4692   DebugLoc dl = Op.getDebugLoc();
4693   if (VT.getSizeInBits() == 8) {
4694     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4695                                     Op.getOperand(0), Op.getOperand(1));
4696     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4697                                     DAG.getValueType(VT));
4698     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4699   } else if (VT.getSizeInBits() == 16) {
4700     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4701     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4702     if (Idx == 0)
4703       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4704                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4705                                      DAG.getNode(ISD::BIT_CONVERT, dl,
4706                                                  MVT::v4i32,
4707                                                  Op.getOperand(0)),
4708                                      Op.getOperand(1)));
4709     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4710                                     Op.getOperand(0), Op.getOperand(1));
4711     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4712                                     DAG.getValueType(VT));
4713     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4714   } else if (VT == MVT::f32) {
4715     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4716     // the result back to FR32 register. It's only worth matching if the
4717     // result has a single use which is a store or a bitcast to i32.  And in
4718     // the case of a store, it's not worth it if the index is a constant 0,
4719     // because a MOVSSmr can be used instead, which is smaller and faster.
4720     if (!Op.hasOneUse())
4721       return SDValue();
4722     SDNode *User = *Op.getNode()->use_begin();
4723     if ((User->getOpcode() != ISD::STORE ||
4724          (isa<ConstantSDNode>(Op.getOperand(1)) &&
4725           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4726         (User->getOpcode() != ISD::BIT_CONVERT ||
4727          User->getValueType(0) != MVT::i32))
4728       return SDValue();
4729     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4730                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4731                                               Op.getOperand(0)),
4732                                               Op.getOperand(1));
4733     return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4734   } else if (VT == MVT::i32) {
4735     // ExtractPS works with constant index.
4736     if (isa<ConstantSDNode>(Op.getOperand(1)))
4737       return Op;
4738   }
4739   return SDValue();
4740 }
4741
4742
4743 SDValue
4744 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4745   if (!isa<ConstantSDNode>(Op.getOperand(1)))
4746     return SDValue();
4747
4748   if (Subtarget->hasSSE41()) {
4749     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4750     if (Res.getNode())
4751       return Res;
4752   }
4753
4754   EVT VT = Op.getValueType();
4755   DebugLoc dl = Op.getDebugLoc();
4756   // TODO: handle v16i8.
4757   if (VT.getSizeInBits() == 16) {
4758     SDValue Vec = Op.getOperand(0);
4759     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4760     if (Idx == 0)
4761       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4762                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4763                                      DAG.getNode(ISD::BIT_CONVERT, dl,
4764                                                  MVT::v4i32, Vec),
4765                                      Op.getOperand(1)));
4766     // Transform it so it match pextrw which produces a 32-bit result.
4767     EVT EltVT = MVT::i32;
4768     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4769                                     Op.getOperand(0), Op.getOperand(1));
4770     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4771                                     DAG.getValueType(VT));
4772     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4773   } else if (VT.getSizeInBits() == 32) {
4774     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4775     if (Idx == 0)
4776       return Op;
4777
4778     // SHUFPS the element to the lowest double word, then movss.
4779     int Mask[4] = { Idx, -1, -1, -1 };
4780     EVT VVT = Op.getOperand(0).getValueType();
4781     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4782                                        DAG.getUNDEF(VVT), Mask);
4783     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4784                        DAG.getIntPtrConstant(0));
4785   } else if (VT.getSizeInBits() == 64) {
4786     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4787     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4788     //        to match extract_elt for f64.
4789     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4790     if (Idx == 0)
4791       return Op;
4792
4793     // UNPCKHPD the element to the lowest double word, then movsd.
4794     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4795     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4796     int Mask[2] = { 1, -1 };
4797     EVT VVT = Op.getOperand(0).getValueType();
4798     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4799                                        DAG.getUNDEF(VVT), Mask);
4800     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4801                        DAG.getIntPtrConstant(0));
4802   }
4803
4804   return SDValue();
4805 }
4806
4807 SDValue
4808 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4809   EVT VT = Op.getValueType();
4810   EVT EltVT = VT.getVectorElementType();
4811   DebugLoc dl = Op.getDebugLoc();
4812
4813   SDValue N0 = Op.getOperand(0);
4814   SDValue N1 = Op.getOperand(1);
4815   SDValue N2 = Op.getOperand(2);
4816
4817   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
4818       isa<ConstantSDNode>(N2)) {
4819     unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4820                                                 : X86ISD::PINSRW;
4821     // Transform it so it match pinsr{b,w} which expects a GR32 as its second
4822     // argument.
4823     if (N1.getValueType() != MVT::i32)
4824       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4825     if (N2.getValueType() != MVT::i32)
4826       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4827     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4828   } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4829     // Bits [7:6] of the constant are the source select.  This will always be
4830     //  zero here.  The DAG Combiner may combine an extract_elt index into these
4831     //  bits.  For example (insert (extract, 3), 2) could be matched by putting
4832     //  the '3' into bits [7:6] of X86ISD::INSERTPS.
4833     // Bits [5:4] of the constant are the destination select.  This is the
4834     //  value of the incoming immediate.
4835     // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
4836     //   combine either bitwise AND or insert of float 0.0 to set these bits.
4837     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4838     // Create this as a scalar to vector..
4839     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4840     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4841   } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4842     // PINSR* works with constant index.
4843     return Op;
4844   }
4845   return SDValue();
4846 }
4847
4848 SDValue
4849 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4850   EVT VT = Op.getValueType();
4851   EVT EltVT = VT.getVectorElementType();
4852
4853   if (Subtarget->hasSSE41())
4854     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4855
4856   if (EltVT == MVT::i8)
4857     return SDValue();
4858
4859   DebugLoc dl = Op.getDebugLoc();
4860   SDValue N0 = Op.getOperand(0);
4861   SDValue N1 = Op.getOperand(1);
4862   SDValue N2 = Op.getOperand(2);
4863
4864   if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
4865     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
4866     // as its second argument.
4867     if (N1.getValueType() != MVT::i32)
4868       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4869     if (N2.getValueType() != MVT::i32)
4870       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4871     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
4872   }
4873   return SDValue();
4874 }
4875
4876 SDValue
4877 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4878   DebugLoc dl = Op.getDebugLoc();
4879   if (Op.getValueType() == MVT::v2f32)
4880     return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
4881                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
4882                                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
4883                                                Op.getOperand(0))));
4884
4885   if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
4886     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
4887
4888   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
4889   EVT VT = MVT::v2i32;
4890   switch (Op.getValueType().getSimpleVT().SimpleTy) {
4891   default: break;
4892   case MVT::v16i8:
4893   case MVT::v8i16:
4894     VT = MVT::v4i32;
4895     break;
4896   }
4897   return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
4898                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
4899 }
4900
4901 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
4902 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
4903 // one of the above mentioned nodes. It has to be wrapped because otherwise
4904 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
4905 // be used to form addressing mode. These wrapped nodes will be selected
4906 // into MOV32ri.
4907 SDValue
4908 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4909   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4910
4911   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4912   // global base reg.
4913   unsigned char OpFlag = 0;
4914   unsigned WrapperKind = X86ISD::Wrapper;
4915   CodeModel::Model M = getTargetMachine().getCodeModel();
4916
4917   if (Subtarget->isPICStyleRIPRel() &&
4918       (M == CodeModel::Small || M == CodeModel::Kernel))
4919     WrapperKind = X86ISD::WrapperRIP;
4920   else if (Subtarget->isPICStyleGOT())
4921     OpFlag = X86II::MO_GOTOFF;
4922   else if (Subtarget->isPICStyleStubPIC())
4923     OpFlag = X86II::MO_PIC_BASE_OFFSET;
4924
4925   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
4926                                              CP->getAlignment(),
4927                                              CP->getOffset(), OpFlag);
4928   DebugLoc DL = CP->getDebugLoc();
4929   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4930   // With PIC, the address is actually $g + Offset.
4931   if (OpFlag) {
4932     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4933                          DAG.getNode(X86ISD::GlobalBaseReg,
4934                                      DebugLoc::getUnknownLoc(), getPointerTy()),
4935                          Result);
4936   }
4937
4938   return Result;
4939 }
4940
4941 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4942   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4943
4944   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4945   // global base reg.
4946   unsigned char OpFlag = 0;
4947   unsigned WrapperKind = X86ISD::Wrapper;
4948   CodeModel::Model M = getTargetMachine().getCodeModel();
4949
4950   if (Subtarget->isPICStyleRIPRel() &&
4951       (M == CodeModel::Small || M == CodeModel::Kernel))
4952     WrapperKind = X86ISD::WrapperRIP;
4953   else if (Subtarget->isPICStyleGOT())
4954     OpFlag = X86II::MO_GOTOFF;
4955   else if (Subtarget->isPICStyleStubPIC())
4956     OpFlag = X86II::MO_PIC_BASE_OFFSET;
4957
4958   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
4959                                           OpFlag);
4960   DebugLoc DL = JT->getDebugLoc();
4961   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4962
4963   // With PIC, the address is actually $g + Offset.
4964   if (OpFlag) {
4965     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4966                          DAG.getNode(X86ISD::GlobalBaseReg,
4967                                      DebugLoc::getUnknownLoc(), getPointerTy()),
4968                          Result);
4969   }
4970
4971   return Result;
4972 }
4973
4974 SDValue
4975 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
4976   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
4977
4978   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4979   // global base reg.
4980   unsigned char OpFlag = 0;
4981   unsigned WrapperKind = X86ISD::Wrapper;
4982   CodeModel::Model M = getTargetMachine().getCodeModel();
4983
4984   if (Subtarget->isPICStyleRIPRel() &&
4985       (M == CodeModel::Small || M == CodeModel::Kernel))
4986     WrapperKind = X86ISD::WrapperRIP;
4987   else if (Subtarget->isPICStyleGOT())
4988     OpFlag = X86II::MO_GOTOFF;
4989   else if (Subtarget->isPICStyleStubPIC())
4990     OpFlag = X86II::MO_PIC_BASE_OFFSET;
4991
4992   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
4993
4994   DebugLoc DL = Op.getDebugLoc();
4995   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4996
4997
4998   // With PIC, the address is actually $g + Offset.
4999   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
5000       !Subtarget->is64Bit()) {
5001     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5002                          DAG.getNode(X86ISD::GlobalBaseReg,
5003                                      DebugLoc::getUnknownLoc(),
5004                                      getPointerTy()),
5005                          Result);
5006   }
5007
5008   return Result;
5009 }
5010
5011 SDValue
5012 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
5013   // Create the TargetBlockAddressAddress node.
5014   unsigned char OpFlags =
5015     Subtarget->ClassifyBlockAddressReference();
5016   CodeModel::Model M = getTargetMachine().getCodeModel();
5017   BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
5018   DebugLoc dl = Op.getDebugLoc();
5019   SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
5020                                        /*isTarget=*/true, OpFlags);
5021
5022   if (Subtarget->isPICStyleRIPRel() &&
5023       (M == CodeModel::Small || M == CodeModel::Kernel))
5024     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5025   else
5026     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5027
5028   // With PIC, the address is actually $g + Offset.
5029   if (isGlobalRelativeToPICBase(OpFlags)) {
5030     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5031                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5032                          Result);
5033   }
5034
5035   return Result;
5036 }
5037
5038 SDValue
5039 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
5040                                       int64_t Offset,
5041                                       SelectionDAG &DAG) const {
5042   // Create the TargetGlobalAddress node, folding in the constant
5043   // offset if it is legal.
5044   unsigned char OpFlags =
5045     Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5046   CodeModel::Model M = getTargetMachine().getCodeModel();
5047   SDValue Result;
5048   if (OpFlags == X86II::MO_NO_FLAG &&
5049       X86::isOffsetSuitableForCodeModel(Offset, M)) {
5050     // A direct static reference to a global.
5051     Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
5052     Offset = 0;
5053   } else {
5054     Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
5055   }
5056
5057   if (Subtarget->isPICStyleRIPRel() &&
5058       (M == CodeModel::Small || M == CodeModel::Kernel))
5059     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5060   else
5061     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5062
5063   // With PIC, the address is actually $g + Offset.
5064   if (isGlobalRelativeToPICBase(OpFlags)) {
5065     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5066                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5067                          Result);
5068   }
5069
5070   // For globals that require a load from a stub to get the address, emit the
5071   // load.
5072   if (isGlobalStubReference(OpFlags))
5073     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
5074                          PseudoSourceValue::getGOT(), 0, false, false, 0);
5075
5076   // If there was a non-zero offset that we didn't fold, create an explicit
5077   // addition for it.
5078   if (Offset != 0)
5079     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
5080                          DAG.getConstant(Offset, getPointerTy()));
5081
5082   return Result;
5083 }
5084
5085 SDValue
5086 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
5087   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5088   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
5089   return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
5090 }
5091
5092 static SDValue
5093 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
5094            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
5095            unsigned char OperandFlags) {
5096   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5097   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5098   DebugLoc dl = GA->getDebugLoc();
5099   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
5100                                            GA->getValueType(0),
5101                                            GA->getOffset(),
5102                                            OperandFlags);
5103   if (InFlag) {
5104     SDValue Ops[] = { Chain,  TGA, *InFlag };
5105     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
5106   } else {
5107     SDValue Ops[]  = { Chain, TGA };
5108     Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
5109   }
5110
5111   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
5112   MFI->setHasCalls(true);
5113
5114   SDValue Flag = Chain.getValue(1);
5115   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5116 }
5117
5118 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
5119 static SDValue
5120 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5121                                 const EVT PtrVT) {
5122   SDValue InFlag;
5123   DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
5124   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5125                                      DAG.getNode(X86ISD::GlobalBaseReg,
5126                                                  DebugLoc::getUnknownLoc(),
5127                                                  PtrVT), InFlag);
5128   InFlag = Chain.getValue(1);
5129
5130   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5131 }
5132
5133 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
5134 static SDValue
5135 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5136                                 const EVT PtrVT) {
5137   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5138                     X86::RAX, X86II::MO_TLSGD);
5139 }
5140
5141 // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
5142 // "local exec" model.
5143 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5144                                    const EVT PtrVT, TLSModel::Model model,
5145                                    bool is64Bit) {
5146   DebugLoc dl = GA->getDebugLoc();
5147   // Get the Thread Pointer
5148   SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5149                              DebugLoc::getUnknownLoc(), PtrVT,
5150                              DAG.getRegister(is64Bit? X86::FS : X86::GS,
5151                                              MVT::i32));
5152
5153   SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5154                                       NULL, 0, false, false, 0);
5155
5156   unsigned char OperandFlags = 0;
5157   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
5158   // initialexec.
5159   unsigned WrapperKind = X86ISD::Wrapper;
5160   if (model == TLSModel::LocalExec) {
5161     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5162   } else if (is64Bit) {
5163     assert(model == TLSModel::InitialExec);
5164     OperandFlags = X86II::MO_GOTTPOFF;
5165     WrapperKind = X86ISD::WrapperRIP;
5166   } else {
5167     assert(model == TLSModel::InitialExec);
5168     OperandFlags = X86II::MO_INDNTPOFF;
5169   }
5170
5171   // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
5172   // exec)
5173   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
5174                                            GA->getOffset(), OperandFlags);
5175   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
5176
5177   if (model == TLSModel::InitialExec)
5178     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
5179                          PseudoSourceValue::getGOT(), 0, false, false, 0);
5180
5181   // The address of the thread local variable is the add of the thread
5182   // pointer with the offset of the variable.
5183   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5184 }
5185
5186 SDValue
5187 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
5188   // TODO: implement the "local dynamic" model
5189   // TODO: implement the "initial exec"model for pic executables
5190   assert(Subtarget->isTargetELF() &&
5191          "TLS not implemented for non-ELF targets");
5192   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5193   const GlobalValue *GV = GA->getGlobal();
5194
5195   // If GV is an alias then use the aliasee for determining
5196   // thread-localness.
5197   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
5198     GV = GA->resolveAliasedGlobal(false);
5199
5200   TLSModel::Model model = getTLSModel(GV,
5201                                       getTargetMachine().getRelocationModel());
5202
5203   switch (model) {
5204   case TLSModel::GeneralDynamic:
5205   case TLSModel::LocalDynamic: // not implemented
5206     if (Subtarget->is64Bit())
5207       return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
5208     return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
5209
5210   case TLSModel::InitialExec:
5211   case TLSModel::LocalExec:
5212     return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
5213                                Subtarget->is64Bit());
5214   }
5215
5216   llvm_unreachable("Unreachable");
5217   return SDValue();
5218 }
5219
5220
5221 /// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
5222 /// take a 2 x i32 value to shift plus a shift amount.
5223 SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
5224   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5225   EVT VT = Op.getValueType();
5226   unsigned VTBits = VT.getSizeInBits();
5227   DebugLoc dl = Op.getDebugLoc();
5228   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
5229   SDValue ShOpLo = Op.getOperand(0);
5230   SDValue ShOpHi = Op.getOperand(1);
5231   SDValue ShAmt  = Op.getOperand(2);
5232   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
5233                                      DAG.getConstant(VTBits - 1, MVT::i8))
5234                        : DAG.getConstant(0, VT);
5235
5236   SDValue Tmp2, Tmp3;
5237   if (Op.getOpcode() == ISD::SHL_PARTS) {
5238     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
5239     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5240   } else {
5241     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
5242     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
5243   }
5244
5245   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
5246                                 DAG.getConstant(VTBits, MVT::i8));
5247   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
5248                              AndNode, DAG.getConstant(0, MVT::i8));
5249
5250   SDValue Hi, Lo;
5251   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5252   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
5253   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
5254
5255   if (Op.getOpcode() == ISD::SHL_PARTS) {
5256     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5257     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5258   } else {
5259     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5260     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5261   }
5262
5263   SDValue Ops[2] = { Lo, Hi };
5264   return DAG.getMergeValues(Ops, 2, dl);
5265 }
5266
5267 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5268   EVT SrcVT = Op.getOperand(0).getValueType();
5269
5270   if (SrcVT.isVector()) {
5271     if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
5272       return Op;
5273     }
5274     return SDValue();
5275   }
5276
5277   assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
5278          "Unknown SINT_TO_FP to lower!");
5279
5280   // These are really Legal; return the operand so the caller accepts it as
5281   // Legal.
5282   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
5283     return Op;
5284   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
5285       Subtarget->is64Bit()) {
5286     return Op;
5287   }
5288
5289   DebugLoc dl = Op.getDebugLoc();
5290   unsigned Size = SrcVT.getSizeInBits()/8;
5291   MachineFunction &MF = DAG.getMachineFunction();
5292   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
5293   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5294   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5295                                StackSlot,
5296                                PseudoSourceValue::getFixedStack(SSFI), 0,
5297                                false, false, 0);
5298   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
5299 }
5300
5301 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
5302                                      SDValue StackSlot,
5303                                      SelectionDAG &DAG) {
5304   // Build the FILD
5305   DebugLoc dl = Op.getDebugLoc();
5306   SDVTList Tys;
5307   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
5308   if (useSSE)
5309     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
5310   else
5311     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
5312   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
5313   SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
5314                                Tys, Ops, array_lengthof(Ops));
5315
5316   if (useSSE) {
5317     Chain = Result.getValue(1);
5318     SDValue InFlag = Result.getValue(2);
5319
5320     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
5321     // shouldn't be necessary except that RFP cannot be live across
5322     // multiple blocks. When stackifier is fixed, they can be uncoupled.
5323     MachineFunction &MF = DAG.getMachineFunction();
5324     int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
5325     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5326     Tys = DAG.getVTList(MVT::Other);
5327     SDValue Ops[] = {
5328       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
5329     };
5330     Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
5331     Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
5332                          PseudoSourceValue::getFixedStack(SSFI), 0,
5333                          false, false, 0);
5334   }
5335
5336   return Result;
5337 }
5338
5339 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
5340 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
5341   // This algorithm is not obvious. Here it is in C code, more or less:
5342   /*
5343     double uint64_to_double( uint32_t hi, uint32_t lo ) {
5344       static const __m128i exp = { 0x4330000045300000ULL, 0 };
5345       static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
5346
5347       // Copy ints to xmm registers.
5348       __m128i xh = _mm_cvtsi32_si128( hi );
5349       __m128i xl = _mm_cvtsi32_si128( lo );
5350
5351       // Combine into low half of a single xmm register.
5352       __m128i x = _mm_unpacklo_epi32( xh, xl );
5353       __m128d d;
5354       double sd;
5355
5356       // Merge in appropriate exponents to give the integer bits the right
5357       // magnitude.
5358       x = _mm_unpacklo_epi32( x, exp );
5359
5360       // Subtract away the biases to deal with the IEEE-754 double precision
5361       // implicit 1.
5362       d = _mm_sub_pd( (__m128d) x, bias );
5363
5364       // All conversions up to here are exact. The correctly rounded result is
5365       // calculated using the current rounding mode using the following
5366       // horizontal add.
5367       d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
5368       _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
5369                                 // store doesn't really need to be here (except
5370                                 // maybe to zero the other double)
5371       return sd;
5372     }
5373   */
5374
5375   DebugLoc dl = Op.getDebugLoc();
5376   LLVMContext *Context = DAG.getContext();
5377
5378   // Build some magic constants.
5379   std::vector<Constant*> CV0;
5380   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
5381   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
5382   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5383   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5384   Constant *C0 = ConstantVector::get(CV0);
5385   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
5386
5387   std::vector<Constant*> CV1;
5388   CV1.push_back(
5389     ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
5390   CV1.push_back(
5391     ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
5392   Constant *C1 = ConstantVector::get(CV1);
5393   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
5394
5395   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5396                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5397                                         Op.getOperand(0),
5398                                         DAG.getIntPtrConstant(1)));
5399   SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5400                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5401                                         Op.getOperand(0),
5402                                         DAG.getIntPtrConstant(0)));
5403   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
5404   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
5405                               PseudoSourceValue::getConstantPool(), 0,
5406                               false, false, 16);
5407   SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
5408   SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
5409   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
5410                               PseudoSourceValue::getConstantPool(), 0,
5411                               false, false, 16);
5412   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
5413
5414   // Add the halves; easiest way is to swap them into another reg first.
5415   int ShufMask[2] = { 1, -1 };
5416   SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
5417                                       DAG.getUNDEF(MVT::v2f64), ShufMask);
5418   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
5419   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
5420                      DAG.getIntPtrConstant(0));
5421 }
5422
5423 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
5424 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
5425   DebugLoc dl = Op.getDebugLoc();
5426   // FP constant to bias correct the final result.
5427   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
5428                                    MVT::f64);
5429
5430   // Load the 32-bit value into an XMM register.
5431   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5432                              DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5433                                          Op.getOperand(0),
5434                                          DAG.getIntPtrConstant(0)));
5435
5436   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5437                      DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
5438                      DAG.getIntPtrConstant(0));
5439
5440   // Or the load with the bias.
5441   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
5442                            DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5443                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5444                                                    MVT::v2f64, Load)),
5445                            DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5446                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5447                                                    MVT::v2f64, Bias)));
5448   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5449                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
5450                    DAG.getIntPtrConstant(0));
5451
5452   // Subtract the bias.
5453   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
5454
5455   // Handle final rounding.
5456   EVT DestVT = Op.getValueType();
5457
5458   if (DestVT.bitsLT(MVT::f64)) {
5459     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5460                        DAG.getIntPtrConstant(0));
5461   } else if (DestVT.bitsGT(MVT::f64)) {
5462     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5463   }
5464
5465   // Handle final rounding.
5466   return Sub;
5467 }
5468
5469 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5470   SDValue N0 = Op.getOperand(0);
5471   DebugLoc dl = Op.getDebugLoc();
5472
5473   // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't
5474   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
5475   // the optimization here.
5476   if (DAG.SignBitIsZero(N0))
5477     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5478
5479   EVT SrcVT = N0.getValueType();
5480   if (SrcVT == MVT::i64) {
5481     // We only handle SSE2 f64 target here; caller can expand the rest.
5482     if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5483       return SDValue();
5484
5485     return LowerUINT_TO_FP_i64(Op, DAG);
5486   } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5487     return LowerUINT_TO_FP_i32(Op, DAG);
5488   }
5489
5490   assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5491
5492   // Make a 64-bit buffer, and use it to build an FILD.
5493   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5494   SDValue WordOff = DAG.getConstant(4, getPointerTy());
5495   SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5496                                    getPointerTy(), StackSlot, WordOff);
5497   SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5498                                 StackSlot, NULL, 0, false, false, 0);
5499   SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5500                                 OffsetSlot, NULL, 0, false, false, 0);
5501   return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5502 }
5503
5504 std::pair<SDValue,SDValue> X86TargetLowering::
5505 FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5506   DebugLoc dl = Op.getDebugLoc();
5507
5508   EVT DstTy = Op.getValueType();
5509
5510   if (!IsSigned) {
5511     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5512     DstTy = MVT::i64;
5513   }
5514
5515   assert(DstTy.getSimpleVT() <= MVT::i64 &&
5516          DstTy.getSimpleVT() >= MVT::i16 &&
5517          "Unknown FP_TO_SINT to lower!");
5518
5519   // These are really Legal.
5520   if (DstTy == MVT::i32 &&
5521       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5522     return std::make_pair(SDValue(), SDValue());
5523   if (Subtarget->is64Bit() &&
5524       DstTy == MVT::i64 &&
5525       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5526     return std::make_pair(SDValue(), SDValue());
5527
5528   // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5529   // stack slot.
5530   MachineFunction &MF = DAG.getMachineFunction();
5531   unsigned MemSize = DstTy.getSizeInBits()/8;
5532   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5533   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5534
5535   unsigned Opc;
5536   switch (DstTy.getSimpleVT().SimpleTy) {
5537   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5538   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5539   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5540   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5541   }
5542
5543   SDValue Chain = DAG.getEntryNode();
5544   SDValue Value = Op.getOperand(0);
5545   if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5546     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5547     Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5548                          PseudoSourceValue::getFixedStack(SSFI), 0,
5549                          false, false, 0);
5550     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5551     SDValue Ops[] = {
5552       Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5553     };
5554     Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5555     Chain = Value.getValue(1);
5556     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5557     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5558   }
5559
5560   // Build the FP_TO_INT*_IN_MEM
5561   SDValue Ops[] = { Chain, Value, StackSlot };
5562   SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5563
5564   return std::make_pair(FIST, StackSlot);
5565 }
5566
5567 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
5568   if (Op.getValueType().isVector()) {
5569     if (Op.getValueType() == MVT::v2i32 &&
5570         Op.getOperand(0).getValueType() == MVT::v2f64) {
5571       return Op;
5572     }
5573     return SDValue();
5574   }
5575
5576   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5577   SDValue FIST = Vals.first, StackSlot = Vals.second;
5578   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5579   if (FIST.getNode() == 0) return Op;
5580
5581   // Load the result.
5582   return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5583                      FIST, StackSlot, NULL, 0, false, false, 0);
5584 }
5585
5586 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
5587   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5588   SDValue FIST = Vals.first, StackSlot = Vals.second;
5589   assert(FIST.getNode() && "Unexpected failure");
5590
5591   // Load the result.
5592   return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5593                      FIST, StackSlot, NULL, 0, false, false, 0);
5594 }
5595
5596 SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
5597   LLVMContext *Context = DAG.getContext();
5598   DebugLoc dl = Op.getDebugLoc();
5599   EVT VT = Op.getValueType();
5600   EVT EltVT = VT;
5601   if (VT.isVector())
5602     EltVT = VT.getVectorElementType();
5603   std::vector<Constant*> CV;
5604   if (EltVT == MVT::f64) {
5605     Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5606     CV.push_back(C);
5607     CV.push_back(C);
5608   } else {
5609     Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5610     CV.push_back(C);
5611     CV.push_back(C);
5612     CV.push_back(C);
5613     CV.push_back(C);
5614   }
5615   Constant *C = ConstantVector::get(CV);
5616   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5617   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5618                              PseudoSourceValue::getConstantPool(), 0,
5619                              false, false, 16);
5620   return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5621 }
5622
5623 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
5624   LLVMContext *Context = DAG.getContext();
5625   DebugLoc dl = Op.getDebugLoc();
5626   EVT VT = Op.getValueType();
5627   EVT EltVT = VT;
5628   if (VT.isVector())
5629     EltVT = VT.getVectorElementType();
5630   std::vector<Constant*> CV;
5631   if (EltVT == MVT::f64) {
5632     Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5633     CV.push_back(C);
5634     CV.push_back(C);
5635   } else {
5636     Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5637     CV.push_back(C);
5638     CV.push_back(C);
5639     CV.push_back(C);
5640     CV.push_back(C);
5641   }
5642   Constant *C = ConstantVector::get(CV);
5643   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5644   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5645                              PseudoSourceValue::getConstantPool(), 0,
5646                              false, false, 16);
5647   if (VT.isVector()) {
5648     return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5649                        DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5650                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5651                                 Op.getOperand(0)),
5652                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5653   } else {
5654     return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
5655   }
5656 }
5657
5658 SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
5659   LLVMContext *Context = DAG.getContext();
5660   SDValue Op0 = Op.getOperand(0);
5661   SDValue Op1 = Op.getOperand(1);
5662   DebugLoc dl = Op.getDebugLoc();
5663   EVT VT = Op.getValueType();
5664   EVT SrcVT = Op1.getValueType();
5665
5666   // If second operand is smaller, extend it first.
5667   if (SrcVT.bitsLT(VT)) {
5668     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5669     SrcVT = VT;
5670   }
5671   // And if it is bigger, shrink it first.
5672   if (SrcVT.bitsGT(VT)) {
5673     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5674     SrcVT = VT;
5675   }
5676
5677   // At this point the operands and the result should have the same
5678   // type, and that won't be f80 since that is not custom lowered.
5679
5680   // First get the sign bit of second operand.
5681   std::vector<Constant*> CV;
5682   if (SrcVT == MVT::f64) {
5683     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5684     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5685   } else {
5686     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5687     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5688     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5689     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5690   }
5691   Constant *C = ConstantVector::get(CV);
5692   SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5693   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5694                               PseudoSourceValue::getConstantPool(), 0,
5695                               false, false, 16);
5696   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5697
5698   // Shift sign bit right or left if the two operands have different types.
5699   if (SrcVT.bitsGT(VT)) {
5700     // Op0 is MVT::f32, Op1 is MVT::f64.
5701     SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5702     SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5703                           DAG.getConstant(32, MVT::i32));
5704     SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5705     SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5706                           DAG.getIntPtrConstant(0));
5707   }
5708
5709   // Clear first operand sign bit.
5710   CV.clear();
5711   if (VT == MVT::f64) {
5712     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
5713     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5714   } else {
5715     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
5716     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5717     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5718     CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5719   }
5720   C = ConstantVector::get(CV);
5721   CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5722   SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5723                               PseudoSourceValue::getConstantPool(), 0,
5724                               false, false, 16);
5725   SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
5726
5727   // Or the value with the sign bit.
5728   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
5729 }
5730
5731 /// Emit nodes that will be selected as "test Op0,Op0", or something
5732 /// equivalent.
     ///
     /// Op is the value to be compared against zero; X86CC is the X86 condition
     /// code that will read the resulting flags.  If that condition needs
     /// neither CF nor OF and Op is result 0 of an add/sub/and/or/xor, the flags
     /// produced by the arithmetic itself are reused where the checks below
     /// allow it (a generic ISD node is first re-created as its two-result
     /// X86ISD counterpart).  Otherwise an X86ISD::CMP of Op against zero is
     /// returned.
5733 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
5734                                     SelectionDAG &DAG) {
5735   DebugLoc dl = Op.getDebugLoc();
5736
5737   // CF and OF aren't always set the way we want. Determine which
5738   // of these we need.
5739   bool NeedCF = false;
5740   bool NeedOF = false;
5741   switch (X86CC) {
5742   case X86::COND_A: case X86::COND_AE:
5743   case X86::COND_B: case X86::COND_BE:
5744     NeedCF = true;
5745     break;
5746   case X86::COND_G: case X86::COND_GE:
5747   case X86::COND_L: case X86::COND_LE:
5748   case X86::COND_O: case X86::COND_NO:
5749     NeedOF = true;
5750     break;
5751   default: break;
5752   }
5753
5754   // See if we can use the EFLAGS value from the operand instead of
5755   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
5756   // we prove that the arithmetic won't overflow, we can't use OF or CF.
5757   if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
         // Opcode stays 0 when no flag-producing replacement should be built;
         // NumOperands is how many of Op's operands the replacement takes.
5758     unsigned Opcode = 0;
5759     unsigned NumOperands = 0;
5760     switch (Op.getNode()->getOpcode()) {
5761     case ISD::ADD:
5762       // Due to an isel shortcoming, be conservative if this add is likely to
5763       // be selected as part of a load-modify-store instruction. When the root
5764       // node in a match is a store, isel doesn't know how to remap non-chain
5765       // non-flag uses of other nodes in the match, such as the ADD in this
5766       // case. This leads to the ADD being left around and reselected, with
5767       // the result being two adds in the output.
5768       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5769            UE = Op.getNode()->use_end(); UI != UE; ++UI)
5770         if (UI->getOpcode() == ISD::STORE)
5771           goto default_case;
5772       if (ConstantSDNode *C =
5773             dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
5774         // An add of one will be selected as an INC.
5775         if (C->getAPIntValue() == 1) {
5776           Opcode = X86ISD::INC;
5777           NumOperands = 1;
5778           break;
5779         }
5780         // An add of negative one (subtract of one) will be selected as a DEC.
5781         if (C->getAPIntValue().isAllOnesValue()) {
5782           Opcode = X86ISD::DEC;
5783           NumOperands = 1;
5784           break;
5785         }
5786       }
5787       // Otherwise use a regular EFLAGS-setting add.
5788       Opcode = X86ISD::ADD;
5789       NumOperands = 2;
5790       break;
5791     case ISD::AND: {
5792       // If the primary 'and' result isn't used, don't bother using
5793       // X86ISD::AND, because a TEST instruction will be better.
5794       bool NonFlagUse = false;
5795       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5796              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
5797         SDNode *User = *UI;
5798         unsigned UOpNo = UI.getOperandNo();
5799         if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
5800           // Look past the truncate.
5801           UOpNo = User->use_begin().getOperandNo();
5802           User = *User->use_begin();
5803         }
             // Users other than BRCOND, SETCC, or SELECT via operand 0 count as
             // non-flag uses of the 'and' result.
5804         if (User->getOpcode() != ISD::BRCOND &&
5805             User->getOpcode() != ISD::SETCC &&
5806             (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
5807           NonFlagUse = true;
5808           break;
5809         }
5810       }
5811       if (!NonFlagUse)
5812         break;
5813     }
5814     // FALL THROUGH
5815     case ISD::SUB:
5816     case ISD::OR:
5817     case ISD::XOR:
5818       // Due to the ISEL shortcoming noted above, be conservative if this op is
5819       // likely to be selected as part of a load-modify-store instruction.
5820       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5821            UE = Op.getNode()->use_end(); UI != UE; ++UI)
5822         if (UI->getOpcode() == ISD::STORE)
5823           goto default_case;
5824       // Otherwise use a regular EFLAGS-setting instruction.
5825       switch (Op.getNode()->getOpcode()) {
5826       case ISD::SUB: Opcode = X86ISD::SUB; break;
5827       case ISD::OR:  Opcode = X86ISD::OR;  break;
5828       case ISD::XOR: Opcode = X86ISD::XOR; break;
5829       case ISD::AND: Opcode = X86ISD::AND; break;
5830       default: llvm_unreachable("unexpected operator!");
5831       }
5832       NumOperands = 2;
5833       break;
5834     case X86ISD::ADD:
5835     case X86ISD::SUB:
5836     case X86ISD::INC:
5837     case X86ISD::DEC:
5838     case X86ISD::OR:
5839     case X86ISD::XOR:
5840     case X86ISD::AND:
           // Op is already one of the two-result X86ISD nodes: hand back its
           // result number 1 directly.
5841       return SDValue(Op.getNode(), 1);
5842     default:
         // Also the target of the gotos above: Opcode is still 0 here, so the
         // plain CMP at the end of the function is used.
5843     default_case:
5844       break;
5845     }
5846     if (Opcode != 0) {
           // Re-create the operation as an X86ISD node with two results (the
           // original value type plus an i32), redirect every user of Op to the
           // new node, and return the new node's result number 1.
5847       SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5848       SmallVector<SDValue, 4> Ops;
5849       for (unsigned i = 0; i != NumOperands; ++i)
5850         Ops.push_back(Op.getOperand(i));
5851       SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
5852       DAG.ReplaceAllUsesWith(Op, New);
5853       return SDValue(New.getNode(), 1);
5854     }
5855   }
5856
5857   // Otherwise just emit a CMP with 0, which is the TEST pattern.
5858   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
5859                      DAG.getConstant(0, Op.getValueType()));
5860 }
5861
5862 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
5863 /// equivalent.
5864 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
5865                                    SelectionDAG &DAG) {
5866   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
5867     if (C->getAPIntValue() == 0)
5868       return EmitTest(Op0, X86CC, DAG);
5869
5870   DebugLoc dl = Op0.getDebugLoc();
5871   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
5872 }
5873
5874 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
5875 /// if it's possible.
5876 static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC,
5877                          DebugLoc dl, SelectionDAG &DAG) {
5878   SDValue LHS, RHS;
5879   if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
5880     if (ConstantSDNode *Op010C =
5881         dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
5882       if (Op010C->getZExtValue() == 1) {
5883         LHS = Op0.getOperand(0);
5884         RHS = Op0.getOperand(1).getOperand(1);
5885       }
5886   } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
5887     if (ConstantSDNode *Op000C =
5888         dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
5889       if (Op000C->getZExtValue() == 1) {
5890         LHS = Op0.getOperand(1);
5891         RHS = Op0.getOperand(0).getOperand(1);
5892       }
5893   } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
5894     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
5895     SDValue AndLHS = Op0.getOperand(0);
5896     if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
5897       LHS = AndLHS.getOperand(0);
5898       RHS = AndLHS.getOperand(1);
5899     }
5900   }
5901
5902   if (LHS.getNode()) {
5903     // If LHS is i8, promote it to i16 with any_extend.  There is no i8 BT
5904     // instruction.  Since the shift amount is in-range-or-undefined, we know
5905     // that doing a bittest on the i16 value is ok.  We extend to i32 because
5906     // the encoding for the i16 version is larger than the i32 version.
5907     if (LHS.getValueType() == MVT::i8)
5908       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
5909
5910     // If the operand types disagree, extend the shift amount to match.  Since
5911     // BT ignores high bits (like shifts) we can use anyextend.
5912     if (LHS.getValueType() != RHS.getValueType())
5913       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
5914
5915     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
5916     unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
5917     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5918                        DAG.getConstant(Cond, MVT::i8), BT);
5919   }
5920
5921   return SDValue();
5922 }
5923
5924 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
5925   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
5926   SDValue Op0 = Op.getOperand(0);
5927   SDValue Op1 = Op.getOperand(1);
5928   DebugLoc dl = Op.getDebugLoc();
5929   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
5930
5931   // Optimize to BT if possible.
5932   // Lower (X & (1 << N)) == 0 to BT(X, N).
5933   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
5934   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
5935   if (Op0.getOpcode() == ISD::AND &&
5936       Op0.hasOneUse() &&
5937       Op1.getOpcode() == ISD::Constant &&
5938       cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
5939       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5940     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
5941     if (NewSetCC.getNode())
5942       return NewSetCC;
5943   }
5944
5945   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5946   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
5947   if (X86CC == X86::COND_INVALID)
5948     return SDValue();
5949
5950   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
5951
5952   // Use sbb x, x to materialize carry bit into a GPR.
5953   if (X86CC == X86::COND_B)
5954     return DAG.getNode(ISD::AND, dl, MVT::i8,
5955                        DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
5956                                    DAG.getConstant(X86CC, MVT::i8), Cond),
5957                        DAG.getConstant(1, MVT::i8));
5958
5959   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5960                      DAG.getConstant(X86CC, MVT::i8), Cond);
5961 }
5962
5963 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
5964   SDValue Cond;
5965   SDValue Op0 = Op.getOperand(0);
5966   SDValue Op1 = Op.getOperand(1);
5967   SDValue CC = Op.getOperand(2);
5968   EVT VT = Op.getValueType();
5969   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
5970   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
5971   DebugLoc dl = Op.getDebugLoc();
5972
5973   if (isFP) {
5974     unsigned SSECC = 8;
5975     EVT VT0 = Op0.getValueType();
5976     assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
5977     unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
5978     bool Swap = false;
5979
5980     switch (SetCCOpcode) {
5981     default: break;
5982     case ISD::SETOEQ:
5983     case ISD::SETEQ:  SSECC = 0; break;
5984     case ISD::SETOGT:
5985     case ISD::SETGT: Swap = true; // Fallthrough
5986     case ISD::SETLT:
5987     case ISD::SETOLT: SSECC = 1; break;
5988     case ISD::SETOGE:
5989     case ISD::SETGE: Swap = true; // Fallthrough
5990     case ISD::SETLE:
5991     case ISD::SETOLE: SSECC = 2; break;
5992     case ISD::SETUO:  SSECC = 3; break;
5993     case ISD::SETUNE:
5994     case ISD::SETNE:  SSECC = 4; break;
5995     case ISD::SETULE: Swap = true;
5996     case ISD::SETUGE: SSECC = 5; break;
5997     case ISD::SETULT: Swap = true;
5998     case ISD::SETUGT: SSECC = 6; break;
5999     case ISD::SETO:   SSECC = 7; break;
6000     }
6001     if (Swap)
6002       std::swap(Op0, Op1);
6003
6004     // In the two special cases we can't handle, emit two comparisons.
6005     if (SSECC == 8) {
6006       if (SetCCOpcode == ISD::SETUEQ) {
6007         SDValue UNORD, EQ;
6008         UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
6009         EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
6010         return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
6011       }
6012       else if (SetCCOpcode == ISD::SETONE) {
6013         SDValue ORD, NEQ;
6014         ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
6015         NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
6016         return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
6017       }
6018       llvm_unreachable("Illegal FP comparison");
6019     }
6020     // Handle all other FP comparisons here.
6021     return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
6022   }
6023
6024   // We are handling one of the integer comparisons here.  Since SSE only has
6025   // GT and EQ comparisons for integer, swapping operands and multiple
6026   // operations may be required for some comparisons.
6027   unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
6028   bool Swap = false, Invert = false, FlipSigns = false;
6029
6030   switch (VT.getSimpleVT().SimpleTy) {
6031   default: break;
6032   case MVT::v8i8:
6033   case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
6034   case MVT::v4i16:
6035   case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
6036   case MVT::v2i32:
6037   case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
6038   case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
6039   }
6040
6041   switch (SetCCOpcode) {
6042   default: break;
6043   case ISD::SETNE:  Invert = true;
6044   case ISD::SETEQ:  Opc = EQOpc; break;
6045   case ISD::SETLT:  Swap = true;
6046   case ISD::SETGT:  Opc = GTOpc; break;
6047   case ISD::SETGE:  Swap = true;
6048   case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
6049   case ISD::SETULT: Swap = true;
6050   case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
6051   case ISD::SETUGE: Swap = true;
6052   case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
6053   }
6054   if (Swap)
6055     std::swap(Op0, Op1);
6056
6057   // Since SSE has no unsigned integer comparisons, we need to flip  the sign
6058   // bits of the inputs before performing those operations.
6059   if (FlipSigns) {
6060     EVT EltVT = VT.getVectorElementType();
6061     SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6062                                       EltVT);
6063     std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6064     SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6065                                     SignBits.size());
6066     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6067     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6068   }
6069
6070   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6071
6072   // If the logical-not of the result is required, perform that now.
6073   if (Invert)
6074     Result = DAG.getNOT(dl, Result, VT);
6075
6076   return Result;
6077 }
6078
6079 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
6080 static bool isX86LogicalCmp(SDValue Op) {
6081   unsigned Opc = Op.getNode()->getOpcode();
6082   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6083     return true;
6084   if (Op.getResNo() == 1 &&
6085       (Opc == X86ISD::ADD ||
6086        Opc == X86ISD::SUB ||
6087        Opc == X86ISD::SMUL ||
6088        Opc == X86ISD::UMUL ||
6089        Opc == X86ISD::INC ||
6090        Opc == X86ISD::DEC ||
6091        Opc == X86ISD::OR ||
6092        Opc == X86ISD::XOR ||
6093        Opc == X86ISD::AND))
6094     return true;
6095
6096   return false;
6097 }
6098
6099 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
6100   bool addTest = true;
6101   SDValue Cond  = Op.getOperand(0);
6102   DebugLoc dl = Op.getDebugLoc();
6103   SDValue CC;
6104
6105   if (Cond.getOpcode() == ISD::SETCC) {
6106     SDValue NewCond = LowerSETCC(Cond, DAG);
6107     if (NewCond.getNode())
6108       Cond = NewCond;
6109   }
6110
6111   // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
6112   SDValue Op1 = Op.getOperand(1);
6113   SDValue Op2 = Op.getOperand(2);
6114   if (Cond.getOpcode() == X86ISD::SETCC &&
6115       cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6116     SDValue Cmp = Cond.getOperand(1);
6117     if (Cmp.getOpcode() == X86ISD::CMP) {
6118       ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6119       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6120       ConstantSDNode *RHSC =
6121         dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6122       if (N1C && N1C->isAllOnesValue() &&
6123           N2C && N2C->isNullValue() &&
6124           RHSC && RHSC->isNullValue()) {
6125         SDValue CmpOp0 = Cmp.getOperand(0);
6126         Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(),
6127                           CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6128         return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6129                            DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6130       }
6131     }
6132   }
6133
6134   // Look pass (and (setcc_carry (cmp ...)), 1).
6135   if (Cond.getOpcode() == ISD::AND &&
6136       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6137     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6138     if (C && C->getAPIntValue() == 1)
6139       Cond = Cond.getOperand(0);
6140   }
6141
6142   // If condition flag is set by a X86ISD::CMP, then use it as the condition
6143   // setting operand in place of the X86ISD::SETCC.
6144   if (Cond.getOpcode() == X86ISD::SETCC ||
6145       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6146     CC = Cond.getOperand(0);
6147
6148     SDValue Cmp = Cond.getOperand(1);
6149     unsigned Opc = Cmp.getOpcode();
6150     EVT VT = Op.getValueType();
6151
6152     bool IllegalFPCMov = false;
6153     if (VT.isFloatingPoint() && !VT.isVector() &&
6154         !isScalarFPTypeInSSEReg(VT))  // FPStack?
6155       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6156
6157     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6158         Opc == X86ISD::BT) { // FIXME
6159       Cond = Cmp;
6160       addTest = false;
6161     }
6162   }
6163
6164   if (addTest) {
6165     // Look pass the truncate.
6166     if (Cond.getOpcode() == ISD::TRUNCATE)
6167       Cond = Cond.getOperand(0);
6168
6169     // We know the result of AND is compared against zero. Try to match
6170     // it to BT.
6171     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6172       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6173       if (NewSetCC.getNode()) {
6174         CC = NewSetCC.getOperand(0);
6175         Cond = NewSetCC.getOperand(1);
6176         addTest = false;
6177       }
6178     }
6179   }
6180
6181   if (addTest) {
6182     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6183     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6184   }
6185
6186   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6187   // condition is true.
6188   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6189   SDValue Ops[] = { Op2, Op1, CC, Cond };
6190   return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6191 }
6192
6193 // isAndOrOfSetCCs - Return true if Op is an ISD::AND or ISD::OR whose two
6194 // operands are both X86ISD::SETCC nodes having a single use.  Opc receives
6195 // Op's opcode whether or not the pattern matches.
6196 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6197   Opc = Op.getOpcode();
6198   bool Match = Opc == ISD::OR || Opc == ISD::AND;
6199   for (unsigned i = 0; Match && i != 2; ++i) {
6200     SDValue Operand = Op.getOperand(i);
6201     Match = Operand.getOpcode() == X86ISD::SETCC && Operand.hasOneUse();
6202   }
6203   return Match;
6204 }
6205
6206 // isXor1OfSetCC - Return true if Op is an ISD::XOR whose operand 1 is the
6207 // constant 1 and whose operand 0 is an X86ISD::SETCC node with a single
6208 // use.
6209 static bool isXor1OfSetCC(SDValue Op) {
6210   if (Op.getOpcode() != ISD::XOR)
6211     return false;
6212   ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6213   if (!RHSConst || !(RHSConst->getAPIntValue() == 1))
6214     return false;
6215   SDValue SetCC = Op.getOperand(0);
6216   return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
6217 }
6218
6219 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6220   bool addTest = true;
6221   SDValue Chain = Op.getOperand(0);
6222   SDValue Cond  = Op.getOperand(1);
6223   SDValue Dest  = Op.getOperand(2);
6224   DebugLoc dl = Op.getDebugLoc();
6225   SDValue CC;
6226
6227   if (Cond.getOpcode() == ISD::SETCC) {
6228     SDValue NewCond = LowerSETCC(Cond, DAG);
6229     if (NewCond.getNode())
6230       Cond = NewCond;
6231   }
6232 #if 0
6233   // FIXME: LowerXALUO doesn't handle these!!
6234   else if (Cond.getOpcode() == X86ISD::ADD  ||
6235            Cond.getOpcode() == X86ISD::SUB  ||
6236            Cond.getOpcode() == X86ISD::SMUL ||
6237            Cond.getOpcode() == X86ISD::UMUL)
6238     Cond = LowerXALUO(Cond, DAG);
6239 #endif
6240
6241   // Look pass (and (setcc_carry (cmp ...)), 1).
6242   if (Cond.getOpcode() == ISD::AND &&
6243       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6244     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6245     if (C && C->getAPIntValue() == 1)
6246       Cond = Cond.getOperand(0);
6247   }
6248
6249   // If condition flag is set by a X86ISD::CMP, then use it as the condition
6250   // setting operand in place of the X86ISD::SETCC.
6251   if (Cond.getOpcode() == X86ISD::SETCC ||
6252       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6253     CC = Cond.getOperand(0);
6254
6255     SDValue Cmp = Cond.getOperand(1);
6256     unsigned Opc = Cmp.getOpcode();
6257     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6258     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6259       Cond = Cmp;
6260       addTest = false;
6261     } else {
6262       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6263       default: break;
6264       case X86::COND_O:
6265       case X86::COND_B:
6266         // These can only come from an arithmetic instruction with overflow,
6267         // e.g. SADDO, UADDO.
6268         Cond = Cond.getNode()->getOperand(1);
6269         addTest = false;
6270         break;
6271       }
6272     }
6273   } else {
6274     unsigned CondOpc;
6275     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6276       SDValue Cmp = Cond.getOperand(0).getOperand(1);
6277       if (CondOpc == ISD::OR) {
6278         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6279         // two branches instead of an explicit OR instruction with a
6280         // separate test.
6281         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6282             isX86LogicalCmp(Cmp)) {
6283           CC = Cond.getOperand(0).getOperand(0);
6284           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6285                               Chain, Dest, CC, Cmp);
6286           CC = Cond.getOperand(1).getOperand(0);
6287           Cond = Cmp;
6288           addTest = false;
6289         }
6290       } else { // ISD::AND
6291         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6292         // two branches instead of an explicit AND instruction with a
6293         // separate test. However, we only do this if this block doesn't
6294         // have a fall-through edge, because this requires an explicit
6295         // jmp when the condition is false.
6296         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6297             isX86LogicalCmp(Cmp) &&
6298             Op.getNode()->hasOneUse()) {
6299           X86::CondCode CCode =
6300             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6301           CCode = X86::GetOppositeBranchCondition(CCode);
6302           CC = DAG.getConstant(CCode, MVT::i8);
6303           SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
6304           // Look for an unconditional branch following this conditional branch.
6305           // We need this because we need to reverse the successors in order
6306           // to implement FCMP_OEQ.
6307           if (User.getOpcode() == ISD::BR) {
6308             SDValue FalseBB = User.getOperand(1);
6309             SDValue NewBR =
6310               DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
6311             assert(NewBR == User);
6312             Dest = FalseBB;
6313
6314             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6315                                 Chain, Dest, CC, Cmp);
6316             X86::CondCode CCode =
6317               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6318             CCode = X86::GetOppositeBranchCondition(CCode);
6319             CC = DAG.getConstant(CCode, MVT::i8);
6320             Cond = Cmp;
6321             addTest = false;
6322           }
6323         }
6324       }
6325     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6326       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
6327       // It should be transformed during dag combiner except when the condition
6328       // is set by a arithmetics with overflow node.
6329       X86::CondCode CCode =
6330         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6331       CCode = X86::GetOppositeBranchCondition(CCode);
6332       CC = DAG.getConstant(CCode, MVT::i8);
6333       Cond = Cond.getOperand(0).getOperand(1);
6334       addTest = false;
6335     }
6336   }
6337
6338   if (addTest) {
6339     // Look pass the truncate.
6340     if (Cond.getOpcode() == ISD::TRUNCATE)
6341       Cond = Cond.getOperand(0);
6342
6343     // We know the result of AND is compared against zero. Try to match
6344     // it to BT.
6345     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6346       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6347       if (NewSetCC.getNode()) {
6348         CC = NewSetCC.getOperand(0);
6349         Cond = NewSetCC.getOperand(1);
6350         addTest = false;
6351       }
6352     }
6353   }
6354
6355   if (addTest) {
6356     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6357     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6358   }
6359   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6360                      Chain, Dest, CC, Cond);
6361 }
6362
6363
6364 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
6365 // Calls to _alloca are needed to probe the stack when allocating more than 4k
6366 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
6367 // that the guard pages used by the OS virtual memory manager are allocated in
6368 // correct sequence.
     //
     // Op's operand 0 is the incoming chain and operand 1 the requested size.
     // The result merges two values: the stack pointer read back after the call
     // and the outgoing chain.
6369 SDValue
6370 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6371                                            SelectionDAG &DAG) {
6372   assert(Subtarget->isTargetCygMing() &&
6373          "This should be used only on Cygwin/Mingw targets");
6374   DebugLoc dl = Op.getDebugLoc();
6375
6376   // Get the inputs.
6377   SDValue Chain = Op.getOperand(0);
6378   SDValue Size  = Op.getOperand(1);
6379   // FIXME: Ensure alignment here
6380
6381   SDValue Flag;
6382
6383   EVT IntPtr = getPointerTy();
6384   EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6385
       // Bracket the call with a zero-sized call sequence.
6386   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
6387
       // The requested size is handed to _alloca in EAX.
6388   Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6389   Flag = Chain.getValue(1);
6390
       // Call operands: chain, callee symbol, the EAX and stack pointer
       // registers, and the flag tying the call to the copy above.
6391   SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6392   SDValue Ops[] = { Chain,
6393                       DAG.getTargetExternalSymbol("_alloca", IntPtr),
6394                       DAG.getRegister(X86::EAX, IntPtr),
6395                       DAG.getRegister(X86StackPtr, SPTy),
6396                       Flag };
6397   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
6398   Flag = Chain.getValue(1);
6399
6400   Chain = DAG.getCALLSEQ_END(Chain,
6401                              DAG.getIntPtrConstant(0, true),
6402                              DAG.getIntPtrConstant(0, true),
6403                              Flag);
6404
       // Read the stack pointer back.  Chain now names result 1 (the chain) of
       // the CopyFromReg, so Chain.getValue(0) below is the copied value.
6405   Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6406
6407   SDValue Ops1[2] = { Chain.getValue(0), Chain };
6408   return DAG.getMergeValues(Ops1, 2, dl);
6409 }
6410
6411 SDValue
6412 X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
6413                                            SDValue Chain,
6414                                            SDValue Dst, SDValue Src,
6415                                            SDValue Size, unsigned Align,
6416                                            const Value *DstSV,
6417                                            uint64_t DstSVOff) {
6418   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6419
6420   // If not DWORD aligned or size is more than the threshold, call the library.
6421   // The libc version is likely to be faster for these cases. It can use the
6422   // address value and run time information about the CPU.
6423   if ((Align & 3) != 0 ||
6424       !ConstantSize ||
6425       ConstantSize->getZExtValue() >
6426         getSubtarget()->getMaxInlineSizeThreshold()) {
6427     SDValue InFlag(0, 0);
6428
6429     // Check to see if there is a specialized entry-point for memory zeroing.
6430     ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
6431
6432     if (const char *bzeroEntry =  V &&
6433         V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
6434       EVT IntPtr = getPointerTy();
6435       const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
6436       TargetLowering::ArgListTy Args;
6437       TargetLowering::ArgListEntry Entry;
6438       Entry.Node = Dst;
6439       Entry.Ty = IntPtrTy;
6440       Args.push_back(Entry);
6441       Entry.Node = Size;
6442       Args.push_back(Entry);
6443       std::pair<SDValue,SDValue> CallResult =
6444         LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
6445                     false, false, false, false,
6446                     0, CallingConv::C, false, /*isReturnValueUsed=*/false,
6447                     DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl,
6448                     DAG.GetOrdering(Chain.getNode()));
6449       return CallResult.second;
6450     }
6451
6452     // Otherwise have the target-independent code call memset.
6453     return SDValue();
6454   }
6455
6456   uint64_t SizeVal = ConstantSize->getZExtValue();
6457   SDValue InFlag(0, 0);
6458   EVT AVT;
6459   SDValue Count;
6460   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
6461   unsigned BytesLeft = 0;
6462   bool TwoRepStos = false;
6463   if (ValC) {
6464     unsigned ValReg;
6465     uint64_t Val = ValC->getZExtValue() & 255;
6466
6467     // If the value is a constant, then we can potentially use larger sets.
6468     switch (Align & 3) {
6469     case 2:   // WORD aligned
6470       AVT = MVT::i16;
6471       ValReg = X86::AX;
6472       Val = (Val << 8) | Val;
6473       break;
6474     case 0:  // DWORD aligned
6475       AVT = MVT::i32;
6476       ValReg = X86::EAX;
6477       Val = (Val << 8)  | Val;
6478       Val = (Val << 16) | Val;
6479       if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
6480         AVT = MVT::i64;
6481         ValReg = X86::RAX;
6482         Val = (Val << 32) | Val;
6483       }
6484       break;
6485     default:  // Byte aligned
6486       AVT = MVT::i8;
6487       ValReg = X86::AL;
6488       Count = DAG.getIntPtrConstant(SizeVal);
6489       break;
6490     }
6491
6492     if (AVT.bitsGT(MVT::i8)) {
6493       unsigned UBytes = AVT.getSizeInBits() / 8;
6494       Count = DAG.getIntPtrConstant(SizeVal / UBytes);
6495       BytesLeft = SizeVal % UBytes;
6496     }
6497
6498     Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
6499                               InFlag);
6500     InFlag = Chain.getValue(1);
6501   } else {
6502     AVT = MVT::i8;
6503     Count  = DAG.getIntPtrConstant(SizeVal);
6504     Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
6505     InFlag = Chain.getValue(1);
6506   }
6507
6508   Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6509                                                               X86::ECX,
6510                             Count, InFlag);
6511   InFlag = Chain.getValue(1);
6512   Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6513                                                               X86::EDI,
6514                             Dst, InFlag);
6515   InFlag = Chain.getValue(1);
6516
6517   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6518   SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6519   Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6520
6521   if (TwoRepStos) {
6522     InFlag = Chain.getValue(1);
6523     Count  = Size;
6524     EVT CVT = Count.getValueType();
6525     SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
6526                                DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
6527     Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
6528                                                              X86::ECX,
6529                               Left, InFlag);
6530     InFlag = Chain.getValue(1);
6531     Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6532     SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
6533     Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6534   } else if (BytesLeft) {
6535     // Handle the last 1 - 7 bytes.
6536     unsigned Offset = SizeVal - BytesLeft;
6537     EVT AddrVT = Dst.getValueType();
6538     EVT SizeVT = Size.getValueType();
6539
6540     Chain = DAG.getMemset(Chain, dl,
6541                           DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
6542                                       DAG.getConstant(Offset, AddrVT)),
6543                           Src,
6544                           DAG.getConstant(BytesLeft, SizeVT),
6545                           Align, DstSV, DstSVOff + Offset);
6546   }
6547
6548   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
6549   return Chain;
6550 }
6551
6552 SDValue
6553 X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
6554                                       SDValue Chain, SDValue Dst, SDValue Src,
6555                                       SDValue Size, unsigned Align,
6556                                       bool AlwaysInline,
6557                                       const Value *DstSV, uint64_t DstSVOff,
6558                                       const Value *SrcSV, uint64_t SrcSVOff) {
6559   // This requires the copy size to be a constant, preferrably
6560   // within a subtarget-specific limit.
6561   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6562   if (!ConstantSize)
6563     return SDValue();
6564   uint64_t SizeVal = ConstantSize->getZExtValue();
6565   if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
6566     return SDValue();
6567
6568   /// If not DWORD aligned, call the library.
6569   if ((Align & 3) != 0)
6570     return SDValue();
6571
6572   // DWORD aligned
6573   EVT AVT = MVT::i32;
6574   if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
6575     AVT = MVT::i64;
6576
6577   unsigned UBytes = AVT.getSizeInBits() / 8;
6578   unsigned CountVal = SizeVal / UBytes;
6579   SDValue Count = DAG.getIntPtrConstant(CountVal);
6580   unsigned BytesLeft = SizeVal % UBytes;
6581
6582   SDValue InFlag(0, 0);
6583   Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6584                                                               X86::ECX,
6585                             Count, InFlag);
6586   InFlag = Chain.getValue(1);
6587   Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6588                                                              X86::EDI,
6589                             Dst, InFlag);
6590   InFlag = Chain.getValue(1);
6591   Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
6592                                                               X86::ESI,
6593                             Src, InFlag);
6594   InFlag = Chain.getValue(1);
6595
6596   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6597   SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6598   SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
6599                                 array_lengthof(Ops));
6600
6601   SmallVector<SDValue, 4> Results;
6602   Results.push_back(RepMovs);
6603   if (BytesLeft) {
6604     // Handle the last 1 - 7 bytes.
6605     unsigned Offset = SizeVal - BytesLeft;
6606     EVT DstVT = Dst.getValueType();
6607     EVT SrcVT = Src.getValueType();
6608     EVT SizeVT = Size.getValueType();
6609     Results.push_back(DAG.getMemcpy(Chain, dl,
6610                                     DAG.getNode(ISD::ADD, dl, DstVT, Dst,
6611                                                 DAG.getConstant(Offset, DstVT)),
6612                                     DAG.getNode(ISD::ADD, dl, SrcVT, Src,
6613                                                 DAG.getConstant(Offset, SrcVT)),
6614                                     DAG.getConstant(BytesLeft, SizeVT),
6615                                     Align, AlwaysInline,
6616                                     DstSV, DstSVOff + Offset,
6617                                     SrcSV, SrcSVOff + Offset));
6618   }
6619
6620   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6621                      &Results[0], Results.size());
6622 }
6623
6624 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
6625   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6626   DebugLoc dl = Op.getDebugLoc();
6627
6628   if (!Subtarget->is64Bit()) {
6629     // vastart just stores the address of the VarArgsFrameIndex slot into the
6630     // memory location argument.
6631     SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6632     return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6633                         false, false, 0);
6634   }
6635
6636   // __va_list_tag:
6637   //   gp_offset         (0 - 6 * 8)
6638   //   fp_offset         (48 - 48 + 8 * 16)
6639   //   overflow_arg_area (point to parameters coming in memory).
6640   //   reg_save_area
6641   SmallVector<SDValue, 8> MemOps;
6642   SDValue FIN = Op.getOperand(1);
6643   // Store gp_offset
6644   SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6645                                DAG.getConstant(VarArgsGPOffset, MVT::i32),
6646                                FIN, SV, 0, false, false, 0);
6647   MemOps.push_back(Store);
6648
6649   // Store fp_offset
6650   FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6651                     FIN, DAG.getIntPtrConstant(4));
6652   Store = DAG.getStore(Op.getOperand(0), dl,
6653                        DAG.getConstant(VarArgsFPOffset, MVT::i32),
6654                        FIN, SV, 0, false, false, 0);
6655   MemOps.push_back(Store);
6656
6657   // Store ptr to overflow_arg_area
6658   FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6659                     FIN, DAG.getIntPtrConstant(4));
6660   SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
6661   Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
6662                        false, false, 0);
6663   MemOps.push_back(Store);
6664
6665   // Store ptr to reg_save_area.
6666   FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6667                     FIN, DAG.getIntPtrConstant(8));
6668   SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
6669   Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
6670                        false, false, 0);
6671   MemOps.push_back(Store);
6672   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6673                      &MemOps[0], MemOps.size());
6674 }
6675
6676 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
6677   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6678   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6679   SDValue Chain = Op.getOperand(0);
6680   SDValue SrcPtr = Op.getOperand(1);
6681   SDValue SrcSV = Op.getOperand(2);
6682
6683   llvm_report_error("VAArgInst is not yet implemented for x86-64!");
6684   return SDValue();
6685 }
6686
6687 SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
6688   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6689   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
6690   SDValue Chain = Op.getOperand(0);
6691   SDValue DstPtr = Op.getOperand(1);
6692   SDValue SrcPtr = Op.getOperand(2);
6693   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6694   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6695   DebugLoc dl = Op.getDebugLoc();
6696
6697   return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6698                        DAG.getIntPtrConstant(24), 8, false,
6699                        DstSV, 0, SrcSV, 0);
6700 }
6701
6702 SDValue
6703 X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
6704   DebugLoc dl = Op.getDebugLoc();
6705   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6706   switch (IntNo) {
6707   default: return SDValue();    // Don't custom lower most intrinsics.
6708   // Comparison intrinsics.
6709   case Intrinsic::x86_sse_comieq_ss:
6710   case Intrinsic::x86_sse_comilt_ss:
6711   case Intrinsic::x86_sse_comile_ss:
6712   case Intrinsic::x86_sse_comigt_ss:
6713   case Intrinsic::x86_sse_comige_ss:
6714   case Intrinsic::x86_sse_comineq_ss:
6715   case Intrinsic::x86_sse_ucomieq_ss:
6716   case Intrinsic::x86_sse_ucomilt_ss:
6717   case Intrinsic::x86_sse_ucomile_ss:
6718   case Intrinsic::x86_sse_ucomigt_ss:
6719   case Intrinsic::x86_sse_ucomige_ss:
6720   case Intrinsic::x86_sse_ucomineq_ss:
6721   case Intrinsic::x86_sse2_comieq_sd:
6722   case Intrinsic::x86_sse2_comilt_sd:
6723   case Intrinsic::x86_sse2_comile_sd:
6724   case Intrinsic::x86_sse2_comigt_sd:
6725   case Intrinsic::x86_sse2_comige_sd:
6726   case Intrinsic::x86_sse2_comineq_sd:
6727   case Intrinsic::x86_sse2_ucomieq_sd:
6728   case Intrinsic::x86_sse2_ucomilt_sd:
6729   case Intrinsic::x86_sse2_ucomile_sd:
6730   case Intrinsic::x86_sse2_ucomigt_sd:
6731   case Intrinsic::x86_sse2_ucomige_sd:
6732   case Intrinsic::x86_sse2_ucomineq_sd: {
6733     unsigned Opc = 0;
6734     ISD::CondCode CC = ISD::SETCC_INVALID;
6735     switch (IntNo) {
6736     default: break;
6737     case Intrinsic::x86_sse_comieq_ss:
6738     case Intrinsic::x86_sse2_comieq_sd:
6739       Opc = X86ISD::COMI;
6740       CC = ISD::SETEQ;
6741       break;
6742     case Intrinsic::x86_sse_comilt_ss:
6743     case Intrinsic::x86_sse2_comilt_sd:
6744       Opc = X86ISD::COMI;
6745       CC = ISD::SETLT;
6746       break;
6747     case Intrinsic::x86_sse_comile_ss:
6748     case Intrinsic::x86_sse2_comile_sd:
6749       Opc = X86ISD::COMI;
6750       CC = ISD::SETLE;
6751       break;
6752     case Intrinsic::x86_sse_comigt_ss:
6753     case Intrinsic::x86_sse2_comigt_sd:
6754       Opc = X86ISD::COMI;
6755       CC = ISD::SETGT;
6756       break;
6757     case Intrinsic::x86_sse_comige_ss:
6758     case Intrinsic::x86_sse2_comige_sd:
6759       Opc = X86ISD::COMI;
6760       CC = ISD::SETGE;
6761       break;
6762     case Intrinsic::x86_sse_comineq_ss:
6763     case Intrinsic::x86_sse2_comineq_sd:
6764       Opc = X86ISD::COMI;
6765       CC = ISD::SETNE;
6766       break;
6767     case Intrinsic::x86_sse_ucomieq_ss:
6768     case Intrinsic::x86_sse2_ucomieq_sd:
6769       Opc = X86ISD::UCOMI;
6770       CC = ISD::SETEQ;
6771       break;
6772     case Intrinsic::x86_sse_ucomilt_ss:
6773     case Intrinsic::x86_sse2_ucomilt_sd:
6774       Opc = X86ISD::UCOMI;
6775       CC = ISD::SETLT;
6776       break;
6777     case Intrinsic::x86_sse_ucomile_ss:
6778     case Intrinsic::x86_sse2_ucomile_sd:
6779       Opc = X86ISD::UCOMI;
6780       CC = ISD::SETLE;
6781       break;
6782     case Intrinsic::x86_sse_ucomigt_ss:
6783     case Intrinsic::x86_sse2_ucomigt_sd:
6784       Opc = X86ISD::UCOMI;
6785       CC = ISD::SETGT;
6786       break;
6787     case Intrinsic::x86_sse_ucomige_ss:
6788     case Intrinsic::x86_sse2_ucomige_sd:
6789       Opc = X86ISD::UCOMI;
6790       CC = ISD::SETGE;
6791       break;
6792     case Intrinsic::x86_sse_ucomineq_ss:
6793     case Intrinsic::x86_sse2_ucomineq_sd:
6794       Opc = X86ISD::UCOMI;
6795       CC = ISD::SETNE;
6796       break;
6797     }
6798
6799     SDValue LHS = Op.getOperand(1);
6800     SDValue RHS = Op.getOperand(2);
6801     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6802     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6803     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6804     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6805                                 DAG.getConstant(X86CC, MVT::i8), Cond);
6806     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6807   }
6808   // ptest intrinsics. The intrinsic these come from are designed to return
6809   // an integer value, not just an instruction so lower it to the ptest
6810   // pattern and a setcc for the result.
6811   case Intrinsic::x86_sse41_ptestz:
6812   case Intrinsic::x86_sse41_ptestc:
6813   case Intrinsic::x86_sse41_ptestnzc:{
6814     unsigned X86CC = 0;
6815     switch (IntNo) {
6816     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6817     case Intrinsic::x86_sse41_ptestz:
6818       // ZF = 1
6819       X86CC = X86::COND_E;
6820       break;
6821     case Intrinsic::x86_sse41_ptestc:
6822       // CF = 1
6823       X86CC = X86::COND_B;
6824       break;
6825     case Intrinsic::x86_sse41_ptestnzc:
6826       // ZF and CF = 0
6827       X86CC = X86::COND_A;
6828       break;
6829     }
6830
6831     SDValue LHS = Op.getOperand(1);
6832     SDValue RHS = Op.getOperand(2);
6833     SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6834     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6835     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6836     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6837   }
6838
6839   // Fix vector shift instructions where the last operand is a non-immediate
6840   // i32 value.
6841   case Intrinsic::x86_sse2_pslli_w:
6842   case Intrinsic::x86_sse2_pslli_d:
6843   case Intrinsic::x86_sse2_pslli_q:
6844   case Intrinsic::x86_sse2_psrli_w:
6845   case Intrinsic::x86_sse2_psrli_d:
6846   case Intrinsic::x86_sse2_psrli_q:
6847   case Intrinsic::x86_sse2_psrai_w:
6848   case Intrinsic::x86_sse2_psrai_d:
6849   case Intrinsic::x86_mmx_pslli_w:
6850   case Intrinsic::x86_mmx_pslli_d:
6851   case Intrinsic::x86_mmx_pslli_q:
6852   case Intrinsic::x86_mmx_psrli_w:
6853   case Intrinsic::x86_mmx_psrli_d:
6854   case Intrinsic::x86_mmx_psrli_q:
6855   case Intrinsic::x86_mmx_psrai_w:
6856   case Intrinsic::x86_mmx_psrai_d: {
6857     SDValue ShAmt = Op.getOperand(2);
6858     if (isa<ConstantSDNode>(ShAmt))
6859       return SDValue();
6860
6861     unsigned NewIntNo = 0;
6862     EVT ShAmtVT = MVT::v4i32;
6863     switch (IntNo) {
6864     case Intrinsic::x86_sse2_pslli_w:
6865       NewIntNo = Intrinsic::x86_sse2_psll_w;
6866       break;
6867     case Intrinsic::x86_sse2_pslli_d:
6868       NewIntNo = Intrinsic::x86_sse2_psll_d;
6869       break;
6870     case Intrinsic::x86_sse2_pslli_q:
6871       NewIntNo = Intrinsic::x86_sse2_psll_q;
6872       break;
6873     case Intrinsic::x86_sse2_psrli_w:
6874       NewIntNo = Intrinsic::x86_sse2_psrl_w;
6875       break;
6876     case Intrinsic::x86_sse2_psrli_d:
6877       NewIntNo = Intrinsic::x86_sse2_psrl_d;
6878       break;
6879     case Intrinsic::x86_sse2_psrli_q:
6880       NewIntNo = Intrinsic::x86_sse2_psrl_q;
6881       break;
6882     case Intrinsic::x86_sse2_psrai_w:
6883       NewIntNo = Intrinsic::x86_sse2_psra_w;
6884       break;
6885     case Intrinsic::x86_sse2_psrai_d:
6886       NewIntNo = Intrinsic::x86_sse2_psra_d;
6887       break;
6888     default: {
6889       ShAmtVT = MVT::v2i32;
6890       switch (IntNo) {
6891       case Intrinsic::x86_mmx_pslli_w:
6892         NewIntNo = Intrinsic::x86_mmx_psll_w;
6893         break;
6894       case Intrinsic::x86_mmx_pslli_d:
6895         NewIntNo = Intrinsic::x86_mmx_psll_d;
6896         break;
6897       case Intrinsic::x86_mmx_pslli_q:
6898         NewIntNo = Intrinsic::x86_mmx_psll_q;
6899         break;
6900       case Intrinsic::x86_mmx_psrli_w:
6901         NewIntNo = Intrinsic::x86_mmx_psrl_w;
6902         break;
6903       case Intrinsic::x86_mmx_psrli_d:
6904         NewIntNo = Intrinsic::x86_mmx_psrl_d;
6905         break;
6906       case Intrinsic::x86_mmx_psrli_q:
6907         NewIntNo = Intrinsic::x86_mmx_psrl_q;
6908         break;
6909       case Intrinsic::x86_mmx_psrai_w:
6910         NewIntNo = Intrinsic::x86_mmx_psra_w;
6911         break;
6912       case Intrinsic::x86_mmx_psrai_d:
6913         NewIntNo = Intrinsic::x86_mmx_psra_d;
6914         break;
6915       default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
6916       }
6917       break;
6918     }
6919     }
6920
6921     // The vector shift intrinsics with scalars uses 32b shift amounts but
6922     // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
6923     // to be zero.
6924     SDValue ShOps[4];
6925     ShOps[0] = ShAmt;
6926     ShOps[1] = DAG.getConstant(0, MVT::i32);
6927     if (ShAmtVT == MVT::v4i32) {
6928       ShOps[2] = DAG.getUNDEF(MVT::i32);
6929       ShOps[3] = DAG.getUNDEF(MVT::i32);
6930       ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
6931     } else {
6932       ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
6933     }
6934
6935     EVT VT = Op.getValueType();
6936     ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
6937     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6938                        DAG.getConstant(NewIntNo, MVT::i32),
6939                        Op.getOperand(1), ShAmt);
6940   }
6941   }
6942 }
6943
6944 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
6945   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6946   DebugLoc dl = Op.getDebugLoc();
6947
6948   if (Depth > 0) {
6949     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6950     SDValue Offset =
6951       DAG.getConstant(TD->getPointerSize(),
6952                       Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6953     return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6954                        DAG.getNode(ISD::ADD, dl, getPointerTy(),
6955                                    FrameAddr, Offset),
6956                        NULL, 0, false, false, 0);
6957   }
6958
6959   // Just load the return address.
6960   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
6961   return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6962                      RetAddrFI, NULL, 0, false, false, 0);
6963 }
6964
6965 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
6966   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6967   MFI->setFrameAddressIsTaken(true);
6968   EVT VT = Op.getValueType();
6969   DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
6970   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6971   unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
6972   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6973   while (Depth--)
6974     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
6975                             false, false, 0);
6976   return FrameAddr;
6977 }
6978
6979 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
6980                                                      SelectionDAG &DAG) {
6981   return DAG.getIntPtrConstant(2*TD->getPointerSize());
6982 }
6983
6984 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
6985 {
6986   MachineFunction &MF = DAG.getMachineFunction();
6987   SDValue Chain     = Op.getOperand(0);
6988   SDValue Offset    = Op.getOperand(1);
6989   SDValue Handler   = Op.getOperand(2);
6990   DebugLoc dl       = Op.getDebugLoc();
6991
6992   SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
6993                                   getPointerTy());
6994   unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
6995
6996   SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
6997                                   DAG.getIntPtrConstant(-TD->getPointerSize()));
6998   StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
6999   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
7000   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
7001   MF.getRegInfo().addLiveOut(StoreAddrReg);
7002
7003   return DAG.getNode(X86ISD::EH_RETURN, dl,
7004                      MVT::Other,
7005                      Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
7006 }
7007
7008 SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
7009                                              SelectionDAG &DAG) {
7010   SDValue Root = Op.getOperand(0);
7011   SDValue Trmp = Op.getOperand(1); // trampoline
7012   SDValue FPtr = Op.getOperand(2); // nested function
7013   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7014   DebugLoc dl  = Op.getDebugLoc();
7015
7016   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7017
7018   if (Subtarget->is64Bit()) {
7019     SDValue OutChains[6];
7020
7021     // Large code-model.
7022     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
7023     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
7024
7025     const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
7026     const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
7027
7028     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
7029
7030     // Load the pointer to the nested function into R11.
7031     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
7032     SDValue Addr = Trmp;
7033     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7034                                 Addr, TrmpAddr, 0, false, false, 0);
7035
7036     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7037                        DAG.getConstant(2, MVT::i64));
7038     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
7039                                 false, false, 2);
7040
7041     // Load the 'nest' parameter value into R10.
7042     // R10 is specified in X86CallingConv.td
7043     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
7044     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7045                        DAG.getConstant(10, MVT::i64));
7046     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7047                                 Addr, TrmpAddr, 10, false, false, 0);
7048
7049     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7050                        DAG.getConstant(12, MVT::i64));
7051     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
7052                                 false, false, 2);
7053
7054     // Jump to the nested function.
7055     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
7056     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7057                        DAG.getConstant(20, MVT::i64));
7058     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7059                                 Addr, TrmpAddr, 20, false, false, 0);
7060
7061     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
7062     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7063                        DAG.getConstant(22, MVT::i64));
7064     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
7065                                 TrmpAddr, 22, false, false, 0);
7066
7067     SDValue Ops[] =
7068       { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
7069     return DAG.getMergeValues(Ops, 2, dl);
7070   } else {
7071     const Function *Func =
7072       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7073     CallingConv::ID CC = Func->getCallingConv();
7074     unsigned NestReg;
7075
7076     switch (CC) {
7077     default:
7078       llvm_unreachable("Unsupported calling convention");
7079     case CallingConv::C:
7080     case CallingConv::X86_StdCall: {
7081       // Pass 'nest' parameter in ECX.
7082       // Must be kept in sync with X86CallingConv.td
7083       NestReg = X86::ECX;
7084
7085       // Check that ECX wasn't needed by an 'inreg' parameter.
7086       const FunctionType *FTy = Func->getFunctionType();
7087       const AttrListPtr &Attrs = Func->getAttributes();
7088
7089       if (!Attrs.isEmpty() && !Func->isVarArg()) {
7090         unsigned InRegCount = 0;
7091         unsigned Idx = 1;
7092
7093         for (FunctionType::param_iterator I = FTy->param_begin(),
7094              E = FTy->param_end(); I != E; ++I, ++Idx)
7095           if (Attrs.paramHasAttr(Idx, Attribute::InReg))
7096             // FIXME: should only count parameters that are lowered to integers.
7097             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
7098
7099         if (InRegCount > 2) {
7100           llvm_report_error("Nest register in use - reduce number of inreg parameters!");
7101         }
7102       }
7103       break;
7104     }
7105     case CallingConv::X86_FastCall:
7106     case CallingConv::Fast:
7107       // Pass 'nest' parameter in EAX.
7108       // Must be kept in sync with X86CallingConv.td
7109       NestReg = X86::EAX;
7110       break;
7111     }
7112
7113     SDValue OutChains[4];
7114     SDValue Addr, Disp;
7115
7116     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7117                        DAG.getConstant(10, MVT::i32));
7118     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
7119
7120     // This is storing the opcode for MOV32ri.
7121     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
7122     const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
7123     OutChains[0] = DAG.getStore(Root, dl,
7124                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
7125                                 Trmp, TrmpAddr, 0, false, false, 0);
7126
7127     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7128                        DAG.getConstant(1, MVT::i32));
7129     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
7130                                 false, false, 1);
7131
7132     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
7133     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7134                        DAG.getConstant(5, MVT::i32));
7135     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
7136                                 TrmpAddr, 5, false, false, 1);
7137
7138     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7139                        DAG.getConstant(6, MVT::i32));
7140     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
7141                                 false, false, 1);
7142
7143     SDValue Ops[] =
7144       { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
7145     return DAG.getMergeValues(Ops, 2, dl);
7146   }
7147 }
7148
7149 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
7150   /*
7151    The rounding mode is in bits 11:10 of FPSR, and has the following
7152    settings:
7153      00 Round to nearest
7154      01 Round to -inf
7155      10 Round to +inf
7156      11 Round to 0
7157
7158   FLT_ROUNDS, on the other hand, expects the following:
7159     -1 Undefined
7160      0 Round to 0
7161      1 Round to nearest
7162      2 Round to +inf
7163      3 Round to -inf
7164
7165   To perform the conversion, we do:
7166     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
7167   */
7168
7169   MachineFunction &MF = DAG.getMachineFunction();
7170   const TargetMachine &TM = MF.getTarget();
7171   const TargetFrameInfo &TFI = *TM.getFrameInfo();
7172   unsigned StackAlignment = TFI.getStackAlignment();
7173   EVT VT = Op.getValueType();
7174   DebugLoc dl = Op.getDebugLoc();
7175
7176   // Save FP Control Word to stack slot
7177   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
7178   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7179
7180   SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
7181                               DAG.getEntryNode(), StackSlot);
7182
7183   // Load FP Control Word from stack slot
7184   SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
7185                             false, false, 0);
7186
7187   // Transform as necessary
7188   SDValue CWD1 =
7189     DAG.getNode(ISD::SRL, dl, MVT::i16,
7190                 DAG.getNode(ISD::AND, dl, MVT::i16,
7191                             CWD, DAG.getConstant(0x800, MVT::i16)),
7192                 DAG.getConstant(11, MVT::i8));
7193   SDValue CWD2 =
7194     DAG.getNode(ISD::SRL, dl, MVT::i16,
7195                 DAG.getNode(ISD::AND, dl, MVT::i16,
7196                             CWD, DAG.getConstant(0x400, MVT::i16)),
7197                 DAG.getConstant(9, MVT::i8));
7198
7199   SDValue RetVal =
7200     DAG.getNode(ISD::AND, dl, MVT::i16,
7201                 DAG.getNode(ISD::ADD, dl, MVT::i16,
7202                             DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
7203                             DAG.getConstant(1, MVT::i16)),
7204                 DAG.getConstant(3, MVT::i16));
7205
7206
7207   return DAG.getNode((VT.getSizeInBits() < 16 ?
7208                       ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
7209 }
7210
7211 SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
7212   EVT VT = Op.getValueType();
7213   EVT OpVT = VT;
7214   unsigned NumBits = VT.getSizeInBits();
7215   DebugLoc dl = Op.getDebugLoc();
7216
7217   Op = Op.getOperand(0);
7218   if (VT == MVT::i8) {
7219     // Zero extend to i32 since there is not an i8 bsr.
7220     OpVT = MVT::i32;
7221     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7222   }
7223
7224   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
7225   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7226   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
7227
7228   // If src is zero (i.e. bsr sets ZF), returns NumBits.
7229   SDValue Ops[] = {
7230     Op,
7231     DAG.getConstant(NumBits+NumBits-1, OpVT),
7232     DAG.getConstant(X86::COND_E, MVT::i8),
7233     Op.getValue(1)
7234   };
7235   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7236
7237   // Finally xor with NumBits-1.
7238   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
7239
7240   if (VT == MVT::i8)
7241     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7242   return Op;
7243 }
7244
7245 SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
7246   EVT VT = Op.getValueType();
7247   EVT OpVT = VT;
7248   unsigned NumBits = VT.getSizeInBits();
7249   DebugLoc dl = Op.getDebugLoc();
7250
7251   Op = Op.getOperand(0);
7252   if (VT == MVT::i8) {
7253     OpVT = MVT::i32;
7254     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7255   }
7256
7257   // Issue a bsf (scan bits forward) which also sets EFLAGS.
7258   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7259   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
7260
7261   // If src is zero (i.e. bsf sets ZF), returns NumBits.
7262   SDValue Ops[] = {
7263     Op,
7264     DAG.getConstant(NumBits, OpVT),
7265     DAG.getConstant(X86::COND_E, MVT::i8),
7266     Op.getValue(1)
7267   };
7268   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7269
7270   if (VT == MVT::i8)
7271     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7272   return Op;
7273 }
7274
7275 SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
7276   EVT VT = Op.getValueType();
7277   assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
7278   DebugLoc dl = Op.getDebugLoc();
7279
7280   //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
7281   //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
7282   //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
7283   //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
7284   //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
7285   //
7286   //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
7287   //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
7288   //  return AloBlo + AloBhi + AhiBlo;
7289
7290   SDValue A = Op.getOperand(0);
7291   SDValue B = Op.getOperand(1);
7292
7293   SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7294                        DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7295                        A, DAG.getConstant(32, MVT::i32));
7296   SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7297                        DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7298                        B, DAG.getConstant(32, MVT::i32));
7299   SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7300                        DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7301                        A, B);
7302   SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7303                        DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7304                        A, Bhi);
7305   SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7306                        DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7307                        Ahi, B);
7308   AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7309                        DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7310                        AloBhi, DAG.getConstant(32, MVT::i32));
7311   AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7312                        DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7313                        AhiBlo, DAG.getConstant(32, MVT::i32));
7314   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
7315   Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
7316   return Res;
7317 }
7318
7319
7320 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
7321   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
7322   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
7323   // looks for this combo and may remove the "setcc" instruction if the "setcc"
7324   // has only one use.
7325   SDNode *N = Op.getNode();
7326   SDValue LHS = N->getOperand(0);
7327   SDValue RHS = N->getOperand(1);
7328   unsigned BaseOp = 0;
7329   unsigned Cond = 0;
7330   DebugLoc dl = Op.getDebugLoc();
7331
7332   switch (Op.getOpcode()) {
7333   default: llvm_unreachable("Unknown ovf instruction!");
7334   case ISD::SADDO:
7335     // A subtract of one will be selected as a INC. Note that INC doesn't
7336     // set CF, so we can't do this for UADDO.
7337     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7338       if (C->getAPIntValue() == 1) {
7339         BaseOp = X86ISD::INC;
7340         Cond = X86::COND_O;
7341         break;
7342       }
7343     BaseOp = X86ISD::ADD;
7344     Cond = X86::COND_O;
7345     break;
7346   case ISD::UADDO:
7347     BaseOp = X86ISD::ADD;
7348     Cond = X86::COND_B;
7349     break;
7350   case ISD::SSUBO:
7351     // A subtract of one will be selected as a DEC. Note that DEC doesn't
7352     // set CF, so we can't do this for USUBO.
7353     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7354       if (C->getAPIntValue() == 1) {
7355         BaseOp = X86ISD::DEC;
7356         Cond = X86::COND_O;
7357         break;
7358       }
7359     BaseOp = X86ISD::SUB;
7360     Cond = X86::COND_O;
7361     break;
7362   case ISD::USUBO:
7363     BaseOp = X86ISD::SUB;
7364     Cond = X86::COND_B;
7365     break;
7366   case ISD::SMULO:
7367     BaseOp = X86ISD::SMUL;
7368     Cond = X86::COND_O;
7369     break;
7370   case ISD::UMULO:
7371     BaseOp = X86ISD::UMUL;
7372     Cond = X86::COND_B;
7373     break;
7374   }
7375
7376   // Also sets EFLAGS.
7377   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7378   SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7379
7380   SDValue SetCC =
7381     DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7382                 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7383
7384   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7385   return Sum;
7386 }
7387
7388 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
7389   EVT T = Op.getValueType();
7390   DebugLoc dl = Op.getDebugLoc();
7391   unsigned Reg = 0;
7392   unsigned size = 0;
7393   switch(T.getSimpleVT().SimpleTy) {
7394   default:
7395     assert(false && "Invalid value type!");
7396   case MVT::i8:  Reg = X86::AL;  size = 1; break;
7397   case MVT::i16: Reg = X86::AX;  size = 2; break;
7398   case MVT::i32: Reg = X86::EAX; size = 4; break;
7399   case MVT::i64:
7400     assert(Subtarget->is64Bit() && "Node not type legal!");
7401     Reg = X86::RAX; size = 8;
7402     break;
7403   }
7404   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7405                                     Op.getOperand(2), SDValue());
7406   SDValue Ops[] = { cpIn.getValue(0),
7407                     Op.getOperand(1),
7408                     Op.getOperand(3),
7409                     DAG.getTargetConstant(size, MVT::i8),
7410                     cpIn.getValue(1) };
7411   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7412   SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7413   SDValue cpOut =
7414     DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7415   return cpOut;
7416 }
7417
7418 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7419                                                  SelectionDAG &DAG) {
7420   assert(Subtarget->is64Bit() && "Result not type legalized?");
7421   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7422   SDValue TheChain = Op.getOperand(0);
7423   DebugLoc dl = Op.getDebugLoc();
7424   SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7425   SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7426   SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7427                                    rax.getValue(2));
7428   SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7429                             DAG.getConstant(32, MVT::i8));
7430   SDValue Ops[] = {
7431     DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
7432     rdx.getValue(1)
7433   };
7434   return DAG.getMergeValues(Ops, 2, dl);
7435 }
7436
7437 SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
7438   SDNode *Node = Op.getNode();
7439   DebugLoc dl = Node->getDebugLoc();
7440   EVT T = Node->getValueType(0);
7441   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
7442                               DAG.getConstant(0, T), Node->getOperand(2));
7443   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
7444                        cast<AtomicSDNode>(Node)->getMemoryVT(),
7445                        Node->getOperand(0),
7446                        Node->getOperand(1), negOp,
7447                        cast<AtomicSDNode>(Node)->getSrcValue(),
7448                        cast<AtomicSDNode>(Node)->getAlignment());
7449 }
7450
7451 /// LowerOperation - Provide custom lowering hooks for some operations.
7452 ///
7453 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
7454   switch (Op.getOpcode()) {
7455   default: llvm_unreachable("Should not custom lower this!");
7456   case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
7457   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
7458   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
7459   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
7460   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
7461   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7462   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
7463   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
7464   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
7465   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
7466   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
7467   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
7468   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
7469   case ISD::SHL_PARTS:
7470   case ISD::SRA_PARTS:
7471   case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
7472   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
7473   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
7474   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
7475   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
7476   case ISD::FABS:               return LowerFABS(Op, DAG);
7477   case ISD::FNEG:               return LowerFNEG(Op, DAG);
7478   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
7479   case ISD::SETCC:              return LowerSETCC(Op, DAG);
7480   case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
7481   case ISD::SELECT:             return LowerSELECT(Op, DAG);
7482   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
7483   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
7484   case ISD::VASTART:            return LowerVASTART(Op, DAG);
7485   case ISD::VAARG:              return LowerVAARG(Op, DAG);
7486   case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
7487   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7488   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
7489   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
7490   case ISD::FRAME_TO_ARGS_OFFSET:
7491                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
7492   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
7493   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
7494   case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
7495   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
7496   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
7497   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
7498   case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
7499   case ISD::SADDO:
7500   case ISD::UADDO:
7501   case ISD::SSUBO:
7502   case ISD::USUBO:
7503   case ISD::SMULO:
7504   case ISD::UMULO:              return LowerXALUO(Op, DAG);
7505   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
7506   }
7507 }
7508
7509 void X86TargetLowering::
7510 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
7511                         SelectionDAG &DAG, unsigned NewOp) {
7512   EVT T = Node->getValueType(0);
7513   DebugLoc dl = Node->getDebugLoc();
7514   assert (T == MVT::i64 && "Only know how to expand i64 atomics");
7515
7516   SDValue Chain = Node->getOperand(0);
7517   SDValue In1 = Node->getOperand(1);
7518   SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7519                              Node->getOperand(2), DAG.getIntPtrConstant(0));
7520   SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7521                              Node->getOperand(2), DAG.getIntPtrConstant(1));
7522   SDValue Ops[] = { Chain, In1, In2L, In2H };
7523   SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
7524   SDValue Result =
7525     DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
7526                             cast<MemSDNode>(Node)->getMemOperand());
7527   SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
7528   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7529   Results.push_back(Result.getValue(2));
7530 }
7531
7532 /// ReplaceNodeResults - Replace a node with an illegal result type
7533 /// with a new node built out of custom code.
7534 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
7535                                            SmallVectorImpl<SDValue>&Results,
7536                                            SelectionDAG &DAG) {
7537   DebugLoc dl = N->getDebugLoc();
7538   switch (N->getOpcode()) {
7539   default:
7540     assert(false && "Do not know how to custom type legalize this operation!");
7541     return;
7542   case ISD::FP_TO_SINT: {
7543     std::pair<SDValue,SDValue> Vals =
7544         FP_TO_INTHelper(SDValue(N, 0), DAG, true);
7545     SDValue FIST = Vals.first, StackSlot = Vals.second;
7546     if (FIST.getNode() != 0) {
7547       EVT VT = N->getValueType(0);
7548       // Return a load from the stack slot.
7549       Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
7550                                     false, false, 0));
7551     }
7552     return;
7553   }
7554   case ISD::READCYCLECOUNTER: {
7555     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7556     SDValue TheChain = N->getOperand(0);
7557     SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7558     SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
7559                                      rd.getValue(1));
7560     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
7561                                      eax.getValue(2));
7562     // Use a buildpair to merge the two 32-bit values into a 64-bit one.
7563     SDValue Ops[] = { eax, edx };
7564     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
7565     Results.push_back(edx.getValue(1));
7566     return;
7567   }
7568   case ISD::ATOMIC_CMP_SWAP: {
7569     EVT T = N->getValueType(0);
7570     assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
7571     SDValue cpInL, cpInH;
7572     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7573                         DAG.getConstant(0, MVT::i32));
7574     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7575                         DAG.getConstant(1, MVT::i32));
7576     cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
7577     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
7578                              cpInL.getValue(1));
7579     SDValue swapInL, swapInH;
7580     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7581                           DAG.getConstant(0, MVT::i32));
7582     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7583                           DAG.getConstant(1, MVT::i32));
7584     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
7585                                cpInH.getValue(1));
7586     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
7587                                swapInL.getValue(1));
7588     SDValue Ops[] = { swapInH.getValue(0),
7589                       N->getOperand(1),
7590                       swapInH.getValue(1) };
7591     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7592     SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
7593     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
7594                                         MVT::i32, Result.getValue(1));
7595     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
7596                                         MVT::i32, cpOutL.getValue(2));
7597     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
7598     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7599     Results.push_back(cpOutH.getValue(1));
7600     return;
7601   }
7602   case ISD::ATOMIC_LOAD_ADD:
7603     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
7604     return;
7605   case ISD::ATOMIC_LOAD_AND:
7606     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
7607     return;
7608   case ISD::ATOMIC_LOAD_NAND:
7609     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
7610     return;
7611   case ISD::ATOMIC_LOAD_OR:
7612     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
7613     return;
7614   case ISD::ATOMIC_LOAD_SUB:
7615     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
7616     return;
7617   case ISD::ATOMIC_LOAD_XOR:
7618     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
7619     return;
7620   case ISD::ATOMIC_SWAP:
7621     ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
7622     return;
7623   }
7624 }
7625
7626 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7627   switch (Opcode) {
7628   default: return NULL;
7629   case X86ISD::BSF:                return "X86ISD::BSF";
7630   case X86ISD::BSR:                return "X86ISD::BSR";
7631   case X86ISD::SHLD:               return "X86ISD::SHLD";
7632   case X86ISD::SHRD:               return "X86ISD::SHRD";
7633   case X86ISD::FAND:               return "X86ISD::FAND";
7634   case X86ISD::FOR:                return "X86ISD::FOR";
7635   case X86ISD::FXOR:               return "X86ISD::FXOR";
7636   case X86ISD::FSRL:               return "X86ISD::FSRL";
7637   case X86ISD::FILD:               return "X86ISD::FILD";
7638   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
7639   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7640   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7641   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7642   case X86ISD::FLD:                return "X86ISD::FLD";
7643   case X86ISD::FST:                return "X86ISD::FST";
7644   case X86ISD::CALL:               return "X86ISD::CALL";
7645   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
7646   case X86ISD::BT:                 return "X86ISD::BT";
7647   case X86ISD::CMP:                return "X86ISD::CMP";
7648   case X86ISD::COMI:               return "X86ISD::COMI";
7649   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
7650   case X86ISD::SETCC:              return "X86ISD::SETCC";
7651   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
7652   case X86ISD::CMOV:               return "X86ISD::CMOV";
7653   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
7654   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
7655   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
7656   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
7657   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
7658   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
7659   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
7660   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
7661   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
7662   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
7663   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
7664   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
7665   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
7666   case X86ISD::FMAX:               return "X86ISD::FMAX";
7667   case X86ISD::FMIN:               return "X86ISD::FMIN";
7668   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
7669   case X86ISD::FRCP:               return "X86ISD::FRCP";
7670   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
7671   case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7672   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
7673   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
7674   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
7675   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
7676   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
7677   case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
7678   case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
7679   case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
7680   case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
7681   case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
7682   case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
7683   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
7684   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
7685   case X86ISD::VSHL:               return "X86ISD::VSHL";
7686   case X86ISD::VSRL:               return "X86ISD::VSRL";
7687   case X86ISD::CMPPD:              return "X86ISD::CMPPD";
7688   case X86ISD::CMPPS:              return "X86ISD::CMPPS";
7689   case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
7690   case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
7691   case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
7692   case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
7693   case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
7694   case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
7695   case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
7696   case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
7697   case X86ISD::ADD:                return "X86ISD::ADD";
7698   case X86ISD::SUB:                return "X86ISD::SUB";
7699   case X86ISD::SMUL:               return "X86ISD::SMUL";
7700   case X86ISD::UMUL:               return "X86ISD::UMUL";
7701   case X86ISD::INC:                return "X86ISD::INC";
7702   case X86ISD::DEC:                return "X86ISD::DEC";
7703   case X86ISD::OR:                 return "X86ISD::OR";
7704   case X86ISD::XOR:                return "X86ISD::XOR";
7705   case X86ISD::AND:                return "X86ISD::AND";
7706   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
7707   case X86ISD::PTEST:              return "X86ISD::PTEST";
7708   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
7709   }
7710 }
7711
7712 // isLegalAddressingMode - Return true if the addressing mode represented
7713 // by AM is legal for this target, for a load/store of the specified type.
7714 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7715                                               const Type *Ty) const {
7716   // X86 supports extremely general addressing modes.
7717   CodeModel::Model M = getTargetMachine().getCodeModel();
7718
7719   // X86 allows a sign-extended 32-bit immediate field as a displacement.
7720   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7721     return false;
7722
7723   if (AM.BaseGV) {
7724     unsigned GVFlags =
7725       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7726
7727     // If a reference to this global requires an extra load, we can't fold it.
7728     if (isGlobalStubReference(GVFlags))
7729       return false;
7730
7731     // If BaseGV requires a register for the PIC base, we cannot also have a
7732     // BaseReg specified.
7733     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7734       return false;
7735
7736     // If lower 4G is not available, then we must use rip-relative addressing.
7737     if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7738       return false;
7739   }
7740
7741   switch (AM.Scale) {
7742   case 0:
7743   case 1:
7744   case 2:
7745   case 4:
7746   case 8:
7747     // These scales always work.
7748     break;
7749   case 3:
7750   case 5:
7751   case 9:
7752     // These scales are formed with basereg+scalereg.  Only accept if there is
7753     // no basereg yet.
7754     if (AM.HasBaseReg)
7755       return false;
7756     break;
7757   default:  // Other stuff never works.
7758     return false;
7759   }
7760
7761   return true;
7762 }
7763
7764
7765 bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7766   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
7767     return false;
7768   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7769   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7770   if (NumBits1 <= NumBits2)
7771     return false;
7772   return Subtarget->is64Bit() || NumBits1 < 64;
7773 }
7774
7775 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7776   if (!VT1.isInteger() || !VT2.isInteger())
7777     return false;
7778   unsigned NumBits1 = VT1.getSizeInBits();
7779   unsigned NumBits2 = VT2.getSizeInBits();
7780   if (NumBits1 <= NumBits2)
7781     return false;
7782   return Subtarget->is64Bit() || NumBits1 < 64;
7783 }
7784
7785 bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7786   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7787   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
7788 }
7789
7790 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7791   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7792   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7793 }
7794
7795 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
7796   // i16 instructions are longer (0x66 prefix) and potentially slower.
7797   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7798 }
7799
7800 /// isShuffleMaskLegal - Targets can use this to indicate that they only
7801 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7802 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7803 /// are assumed to be legal.
7804 bool
7805 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7806                                       EVT VT) const {
7807   // Only do shuffles on 128-bit vector types for now.
7808   if (VT.getSizeInBits() == 64)
7809     return false;
7810
7811   // FIXME: pshufb, blends, shifts.
7812   return (VT.getVectorNumElements() == 2 ||
7813           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7814           isMOVLMask(M, VT) ||
7815           isSHUFPMask(M, VT) ||
7816           isPSHUFDMask(M, VT) ||
7817           isPSHUFHWMask(M, VT) ||
7818           isPSHUFLWMask(M, VT) ||
7819           isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7820           isUNPCKLMask(M, VT) ||
7821           isUNPCKHMask(M, VT) ||
7822           isUNPCKL_v_undef_Mask(M, VT) ||
7823           isUNPCKH_v_undef_Mask(M, VT));
7824 }
7825
7826 bool
7827 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7828                                           EVT VT) const {
7829   unsigned NumElts = VT.getVectorNumElements();
7830   // FIXME: This collection of masks seems suspect.
7831   if (NumElts == 2)
7832     return true;
7833   if (NumElts == 4 && VT.getSizeInBits() == 128) {
7834     return (isMOVLMask(Mask, VT)  ||
7835             isCommutedMOVLMask(Mask, VT, true) ||
7836             isSHUFPMask(Mask, VT) ||
7837             isCommutedSHUFPMask(Mask, VT));
7838   }
7839   return false;
7840 }
7841
7842 //===----------------------------------------------------------------------===//
7843 //                           X86 Scheduler Hooks
7844 //===----------------------------------------------------------------------===//
7845
7846 // private utility function
7847 MachineBasicBlock *
7848 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7849                                                        MachineBasicBlock *MBB,
7850                                                        unsigned regOpc,
7851                                                        unsigned immOpc,
7852                                                        unsigned LoadOpc,
7853                                                        unsigned CXchgOpc,
7854                                                        unsigned copyOpc,
7855                                                        unsigned notOpc,
7856                                                        unsigned EAXreg,
7857                                                        TargetRegisterClass *RC,
7858                                                        bool invSrc) const {
7859   // For the atomic bitwise operator, we generate
7860   //   thisMBB:
7861   //   newMBB:
7862   //     ld  t1 = [bitinstr.addr]
7863   //     op  t2 = t1, [bitinstr.val]
7864   //     mov EAX = t1
7865   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7866   //     bz  newMBB
7867   //     fallthrough -->nextMBB
7868   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7869   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7870   MachineFunction::iterator MBBIter = MBB;
7871   ++MBBIter;
7872
7873   /// First build the CFG
7874   MachineFunction *F = MBB->getParent();
7875   MachineBasicBlock *thisMBB = MBB;
7876   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7877   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7878   F->insert(MBBIter, newMBB);
7879   F->insert(MBBIter, nextMBB);
7880
7881   // Move all successors to thisMBB to nextMBB
7882   nextMBB->transferSuccessors(thisMBB);
7883
7884   // Update thisMBB to fall through to newMBB
7885   thisMBB->addSuccessor(newMBB);
7886
7887   // newMBB jumps to itself and fall through to nextMBB
7888   newMBB->addSuccessor(nextMBB);
7889   newMBB->addSuccessor(newMBB);
7890
7891   // Insert instructions into newMBB based on incoming instruction
7892   assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7893          "unexpected number of operands");
7894   DebugLoc dl = bInstr->getDebugLoc();
7895   MachineOperand& destOper = bInstr->getOperand(0);
7896   MachineOperand* argOpers[2 + X86AddrNumOperands];
7897   int numArgs = bInstr->getNumOperands() - 1;
7898   for (int i=0; i < numArgs; ++i)
7899     argOpers[i] = &bInstr->getOperand(i+1);
7900
7901   // x86 address has 4 operands: base, index, scale, and displacement
7902   int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7903   int valArgIndx = lastAddrIndx + 1;
7904
7905   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7906   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7907   for (int i=0; i <= lastAddrIndx; ++i)
7908     (*MIB).addOperand(*argOpers[i]);
7909
7910   unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7911   if (invSrc) {
7912     MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7913   }
7914   else
7915     tt = t1;
7916
7917   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7918   assert((argOpers[valArgIndx]->isReg() ||
7919           argOpers[valArgIndx]->isImm()) &&
7920          "invalid operand");
7921   if (argOpers[valArgIndx]->isReg())
7922     MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7923   else
7924     MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7925   MIB.addReg(tt);
7926   (*MIB).addOperand(*argOpers[valArgIndx]);
7927
7928   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7929   MIB.addReg(t1);
7930
7931   MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7932   for (int i=0; i <= lastAddrIndx; ++i)
7933     (*MIB).addOperand(*argOpers[i]);
7934   MIB.addReg(t2);
7935   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7936   (*MIB).setMemRefs(bInstr->memoperands_begin(),
7937                     bInstr->memoperands_end());
7938
7939   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7940   MIB.addReg(EAXreg);
7941
7942   // insert branch
7943   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
7944
7945   F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7946   return nextMBB;
7947 }
7948
7949 // private utility function:  64 bit atomics on 32 bit host.
7950 MachineBasicBlock *
7951 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7952                                                        MachineBasicBlock *MBB,
7953                                                        unsigned regOpcL,
7954                                                        unsigned regOpcH,
7955                                                        unsigned immOpcL,
7956                                                        unsigned immOpcH,
7957                                                        bool invSrc) const {
7958   // For the atomic bitwise operator, we generate
7959   //   thisMBB (instructions are in pairs, except cmpxchg8b)
7960   //     ld t1,t2 = [bitinstr.addr]
7961   //   newMBB:
7962   //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7963   //     op  t5, t6 <- out1, out2, [bitinstr.val]
7964   //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
7965   //     mov ECX, EBX <- t5, t6
7966   //     mov EAX, EDX <- t1, t2
7967   //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
7968   //     mov t3, t4 <- EAX, EDX
7969   //     bz  newMBB
7970   //     result in out1, out2
7971   //     fallthrough -->nextMBB
7972
7973   const TargetRegisterClass *RC = X86::GR32RegisterClass;
7974   const unsigned LoadOpc = X86::MOV32rm;
7975   const unsigned copyOpc = X86::MOV32rr;
7976   const unsigned NotOpc = X86::NOT32r;
7977   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7978   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7979   MachineFunction::iterator MBBIter = MBB;
7980   ++MBBIter;
7981
7982   /// First build the CFG
7983   MachineFunction *F = MBB->getParent();
7984   MachineBasicBlock *thisMBB = MBB;
7985   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7986   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7987   F->insert(MBBIter, newMBB);
7988   F->insert(MBBIter, nextMBB);
7989
7990   // Move all successors to thisMBB to nextMBB
7991   nextMBB->transferSuccessors(thisMBB);
7992
7993   // Update thisMBB to fall through to newMBB
7994   thisMBB->addSuccessor(newMBB);
7995
7996   // newMBB jumps to itself and fall through to nextMBB
7997   newMBB->addSuccessor(nextMBB);
7998   newMBB->addSuccessor(newMBB);
7999
8000   DebugLoc dl = bInstr->getDebugLoc();
8001   // Insert instructions into newMBB based on incoming instruction
8002   // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
8003   assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
8004          "unexpected number of operands");
8005   MachineOperand& dest1Oper = bInstr->getOperand(0);
8006   MachineOperand& dest2Oper = bInstr->getOperand(1);
8007   MachineOperand* argOpers[2 + X86AddrNumOperands];
8008   for (int i=0; i < 2 + X86AddrNumOperands; ++i)
8009     argOpers[i] = &bInstr->getOperand(i+2);
8010
8011   // x86 address has 5 operands: base, index, scale, displacement, and segment.
8012   int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
8013
8014   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8015   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8016   for (int i=0; i <= lastAddrIndx; ++i)
8017     (*MIB).addOperand(*argOpers[i]);
8018   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8019   MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8020   // add 4 to displacement.
8021   for (int i=0; i <= lastAddrIndx-2; ++i)
8022     (*MIB).addOperand(*argOpers[i]);
8023   MachineOperand newOp3 = *(argOpers[3]);
8024   if (newOp3.isImm())
8025     newOp3.setImm(newOp3.getImm()+4);
8026   else
8027     newOp3.setOffset(newOp3.getOffset()+4);
8028   (*MIB).addOperand(newOp3);
8029   (*MIB).addOperand(*argOpers[lastAddrIndx]);
8030
8031   // t3/4 are defined later, at the bottom of the loop
8032   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8033   unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8034   BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8035     .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8036   BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8037     .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8038
8039   // The subsequent operations should be using the destination registers of
8040   //the PHI instructions.
8041   if (invSrc) {
8042     t1 = F->getRegInfo().createVirtualRegister(RC);
8043     t2 = F->getRegInfo().createVirtualRegister(RC);
8044     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
8045     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
8046   } else {
8047     t1 = dest1Oper.getReg();
8048     t2 = dest2Oper.getReg();
8049   }
8050
8051   int valArgIndx = lastAddrIndx + 1;
8052   assert((argOpers[valArgIndx]->isReg() ||
8053           argOpers[valArgIndx]->isImm()) &&
8054          "invalid operand");
8055   unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
8056   unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
8057   if (argOpers[valArgIndx]->isReg())
8058     MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
8059   else
8060     MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
8061   if (regOpcL != X86::MOV32rr)
8062     MIB.addReg(t1);
8063   (*MIB).addOperand(*argOpers[valArgIndx]);
8064   assert(argOpers[valArgIndx + 1]->isReg() ==
8065          argOpers[valArgIndx]->isReg());
8066   assert(argOpers[valArgIndx + 1]->isImm() ==
8067          argOpers[valArgIndx]->isImm());
8068   if (argOpers[valArgIndx + 1]->isReg())
8069     MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
8070   else
8071     MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
8072   if (regOpcH != X86::MOV32rr)
8073     MIB.addReg(t2);
8074   (*MIB).addOperand(*argOpers[valArgIndx + 1]);
8075
8076   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
8077   MIB.addReg(t1);
8078   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
8079   MIB.addReg(t2);
8080
8081   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
8082   MIB.addReg(t5);
8083   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
8084   MIB.addReg(t6);
8085
8086   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
8087   for (int i=0; i <= lastAddrIndx; ++i)
8088     (*MIB).addOperand(*argOpers[i]);
8089
8090   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8091   (*MIB).setMemRefs(bInstr->memoperands_begin(),
8092                     bInstr->memoperands_end());
8093
8094   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
8095   MIB.addReg(X86::EAX);
8096   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
8097   MIB.addReg(X86::EDX);
8098
8099   // insert branch
8100   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8101
8102   F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8103   return nextMBB;
8104 }
8105
8106 // private utility function
8107 MachineBasicBlock *
8108 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8109                                                       MachineBasicBlock *MBB,
8110                                                       unsigned cmovOpc) const {
8111   // For the atomic min/max operator, we generate
8112   //   thisMBB:
8113   //   newMBB:
8114   //     ld t1 = [min/max.addr]
8115   //     mov t2 = [min/max.val]
8116   //     cmp  t1, t2
8117   //     cmov[cond] t2 = t1
8118   //     mov EAX = t1
8119   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8120   //     bz   newMBB
8121   //     fallthrough -->nextMBB
8122   //
8123   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8124   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8125   MachineFunction::iterator MBBIter = MBB;
8126   ++MBBIter;
8127
8128   /// First build the CFG
8129   MachineFunction *F = MBB->getParent();
8130   MachineBasicBlock *thisMBB = MBB;
8131   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8132   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8133   F->insert(MBBIter, newMBB);
8134   F->insert(MBBIter, nextMBB);
8135
8136   // Move all successors of thisMBB to nextMBB
8137   nextMBB->transferSuccessors(thisMBB);
8138
8139   // Update thisMBB to fall through to newMBB
8140   thisMBB->addSuccessor(newMBB);
8141
8142   // newMBB jumps to newMBB and fall through to nextMBB
8143   newMBB->addSuccessor(nextMBB);
8144   newMBB->addSuccessor(newMBB);
8145
8146   DebugLoc dl = mInstr->getDebugLoc();
8147   // Insert instructions into newMBB based on incoming instruction
8148   assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8149          "unexpected number of operands");
8150   MachineOperand& destOper = mInstr->getOperand(0);
8151   MachineOperand* argOpers[2 + X86AddrNumOperands];
8152   int numArgs = mInstr->getNumOperands() - 1;
8153   for (int i=0; i < numArgs; ++i)
8154     argOpers[i] = &mInstr->getOperand(i+1);
8155
8156   // x86 address has 4 operands: base, index, scale, and displacement
8157   int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
8158   int valArgIndx = lastAddrIndx + 1;
8159
8160   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8161   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8162   for (int i=0; i <= lastAddrIndx; ++i)
8163     (*MIB).addOperand(*argOpers[i]);
8164
8165   // We only support register and immediate values
8166   assert((argOpers[valArgIndx]->isReg() ||
8167           argOpers[valArgIndx]->isImm()) &&
8168          "invalid operand");
8169
8170   unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8171   if (argOpers[valArgIndx]->isReg())
8172     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8173   else
8174     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8175   (*MIB).addOperand(*argOpers[valArgIndx]);
8176
8177   MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
8178   MIB.addReg(t1);
8179
8180   MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8181   MIB.addReg(t1);
8182   MIB.addReg(t2);
8183
8184   // Generate movc
8185   unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8186   MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
8187   MIB.addReg(t2);
8188   MIB.addReg(t1);
8189
8190   // Cmp and exchange if none has modified the memory location
8191   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8192   for (int i=0; i <= lastAddrIndx; ++i)
8193     (*MIB).addOperand(*argOpers[i]);
8194   MIB.addReg(t3);
8195   assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8196   (*MIB).setMemRefs(mInstr->memoperands_begin(),
8197                     mInstr->memoperands_end());
8198
8199   MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
8200   MIB.addReg(X86::EAX);
8201
8202   // insert branch
8203   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8204
8205   F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
8206   return nextMBB;
8207 }
8208
8209 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
8210 // all of this code can be replaced with that in the .td file.
8211 MachineBasicBlock *
8212 X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
8213                             unsigned numArgs, bool memArg) const {
8214
8215   MachineFunction *F = BB->getParent();
8216   DebugLoc dl = MI->getDebugLoc();
8217   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8218
8219   unsigned Opc;
8220   if (memArg)
8221     Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
8222   else
8223     Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
8224
8225   MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
8226
8227   for (unsigned i = 0; i < numArgs; ++i) {
8228     MachineOperand &Op = MI->getOperand(i+1);
8229
8230     if (!(Op.isReg() && Op.isImplicit()))
8231       MIB.addOperand(Op);
8232   }
8233
8234   BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
8235     .addReg(X86::XMM0);
8236
8237   F->DeleteMachineInstr(MI);
8238
8239   return BB;
8240 }
8241
8242 MachineBasicBlock *
8243 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
8244                                                  MachineInstr *MI,
8245                                                  MachineBasicBlock *MBB) const {
8246   // Emit code to save XMM registers to the stack. The ABI says that the
8247   // number of registers to save is given in %al, so it's theoretically
8248   // possible to do an indirect jump trick to avoid saving all of them,
8249   // however this code takes a simpler approach and just executes all
8250   // of the stores if %al is non-zero. It's less code, and it's probably
8251   // easier on the hardware branch predictor, and stores aren't all that
8252   // expensive anyway.
8253
8254   // Create the new basic blocks. One block contains all the XMM stores,
8255   // and one block is the final destination regardless of whether any
8256   // stores were performed.
8257   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8258   MachineFunction *F = MBB->getParent();
8259   MachineFunction::iterator MBBIter = MBB;
8260   ++MBBIter;
8261   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
8262   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
8263   F->insert(MBBIter, XMMSaveMBB);
8264   F->insert(MBBIter, EndMBB);
8265
8266   // Set up the CFG.
8267   // Move any original successors of MBB to the end block.
8268   EndMBB->transferSuccessors(MBB);
8269   // The original block will now fall through to the XMM save block.
8270   MBB->addSuccessor(XMMSaveMBB);
8271   // The XMMSaveMBB will fall through to the end block.
8272   XMMSaveMBB->addSuccessor(EndMBB);
8273
8274   // Now add the instructions.
8275   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8276   DebugLoc DL = MI->getDebugLoc();
8277
8278   unsigned CountReg = MI->getOperand(0).getReg();
8279   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
8280   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
8281
8282   if (!Subtarget->isTargetWin64()) {
8283     // If %al is 0, branch around the XMM save block.
8284     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
8285     BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
8286     MBB->addSuccessor(EndMBB);
8287   }
8288
8289   // In the XMM save block, save all the XMM argument registers.
8290   for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
8291     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
8292     MachineMemOperand *MMO =
8293       F->getMachineMemOperand(
8294         PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
8295         MachineMemOperand::MOStore, Offset,
8296         /*Size=*/16, /*Align=*/16);
8297     BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
8298       .addFrameIndex(RegSaveFrameIndex)
8299       .addImm(/*Scale=*/1)
8300       .addReg(/*IndexReg=*/0)
8301       .addImm(/*Disp=*/Offset)
8302       .addReg(/*Segment=*/0)
8303       .addReg(MI->getOperand(i).getReg())
8304       .addMemOperand(MMO);
8305   }
8306
8307   F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8308
8309   return EndMBB;
8310 }
8311
8312 MachineBasicBlock *
8313 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
8314                                      MachineBasicBlock *BB,
8315                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8316   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8317   DebugLoc DL = MI->getDebugLoc();
8318
8319   // To "insert" a SELECT_CC instruction, we actually have to insert the
8320   // diamond control-flow pattern.  The incoming instruction knows the
8321   // destination vreg to set, the condition code register to branch on, the
8322   // true/false values to select between, and a branch opcode to use.
8323   const BasicBlock *LLVM_BB = BB->getBasicBlock();
8324   MachineFunction::iterator It = BB;
8325   ++It;
8326
8327   //  thisMBB:
8328   //  ...
8329   //   TrueVal = ...
8330   //   cmpTY ccX, r1, r2
8331   //   bCC copy1MBB
8332   //   fallthrough --> copy0MBB
8333   MachineBasicBlock *thisMBB = BB;
8334   MachineFunction *F = BB->getParent();
8335   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8336   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
8337   unsigned Opc =
8338     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
8339   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
8340   F->insert(It, copy0MBB);
8341   F->insert(It, sinkMBB);
8342   // Update machine-CFG edges by first adding all successors of the current
8343   // block to the new block which will contain the Phi node for the select.
8344   // Also inform sdisel of the edge changes.
8345   for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
8346          E = BB->succ_end(); I != E; ++I) {
8347     EM->insert(std::make_pair(*I, sinkMBB));
8348     sinkMBB->addSuccessor(*I);
8349   }
8350   // Next, remove all successors of the current block, and add the true
8351   // and fallthrough blocks as its successors.
8352   while (!BB->succ_empty())
8353     BB->removeSuccessor(BB->succ_begin());
8354   // Add the true and fallthrough blocks as its successors.
8355   BB->addSuccessor(copy0MBB);
8356   BB->addSuccessor(sinkMBB);
8357
8358   //  copy0MBB:
8359   //   %FalseValue = ...
8360   //   # fallthrough to sinkMBB
8361   BB = copy0MBB;
8362
8363   // Update machine-CFG edges
8364   BB->addSuccessor(sinkMBB);
8365
8366   //  sinkMBB:
8367   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8368   //  ...
8369   BB = sinkMBB;
8370   BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
8371     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
8372     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
8373
8374   F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8375   return BB;
8376 }
8377
8378
8379 MachineBasicBlock *
8380 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
8381                                                MachineBasicBlock *BB,
8382                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8383   switch (MI->getOpcode()) {
8384   default: assert(false && "Unexpected instr type to insert");
8385   case X86::CMOV_GR8:
8386   case X86::CMOV_V1I64:
8387   case X86::CMOV_FR32:
8388   case X86::CMOV_FR64:
8389   case X86::CMOV_V4F32:
8390   case X86::CMOV_V2F64:
8391   case X86::CMOV_V2I64:
8392     return EmitLoweredSelect(MI, BB, EM);
8393
8394   case X86::FP32_TO_INT16_IN_MEM:
8395   case X86::FP32_TO_INT32_IN_MEM:
8396   case X86::FP32_TO_INT64_IN_MEM:
8397   case X86::FP64_TO_INT16_IN_MEM:
8398   case X86::FP64_TO_INT32_IN_MEM:
8399   case X86::FP64_TO_INT64_IN_MEM:
8400   case X86::FP80_TO_INT16_IN_MEM:
8401   case X86::FP80_TO_INT32_IN_MEM:
8402   case X86::FP80_TO_INT64_IN_MEM: {
8403     const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8404     DebugLoc DL = MI->getDebugLoc();
8405
8406     // Change the floating point control register to use "round towards zero"
8407     // mode when truncating to an integer value.
8408     MachineFunction *F = BB->getParent();
8409     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
8410     addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
8411
8412     // Load the old value of the high byte of the control word...
8413     unsigned OldCW =
8414       F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
8415     addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
8416                       CWFrameIdx);
8417
8418     // Set the high part to be round to zero...
8419     addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
8420       .addImm(0xC7F);
8421
8422     // Reload the modified control word now...
8423     addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8424
8425     // Restore the memory image of control word to original value
8426     addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
8427       .addReg(OldCW);
8428
8429     // Get the X86 opcode to use.
8430     unsigned Opc;
8431     switch (MI->getOpcode()) {
8432     default: llvm_unreachable("illegal opcode!");
8433     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
8434     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
8435     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
8436     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
8437     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
8438     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
8439     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
8440     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
8441     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
8442     }
8443
8444     X86AddressMode AM;
8445     MachineOperand &Op = MI->getOperand(0);
8446     if (Op.isReg()) {
8447       AM.BaseType = X86AddressMode::RegBase;
8448       AM.Base.Reg = Op.getReg();
8449     } else {
8450       AM.BaseType = X86AddressMode::FrameIndexBase;
8451       AM.Base.FrameIndex = Op.getIndex();
8452     }
8453     Op = MI->getOperand(1);
8454     if (Op.isImm())
8455       AM.Scale = Op.getImm();
8456     Op = MI->getOperand(2);
8457     if (Op.isImm())
8458       AM.IndexReg = Op.getImm();
8459     Op = MI->getOperand(3);
8460     if (Op.isGlobal()) {
8461       AM.GV = Op.getGlobal();
8462     } else {
8463       AM.Disp = Op.getImm();
8464     }
8465     addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
8466                       .addReg(MI->getOperand(X86AddrNumOperands).getReg());
8467
8468     // Reload the original control word now.
8469     addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8470
8471     F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8472     return BB;
8473   }
8474     // String/text processing lowering.
8475   case X86::PCMPISTRM128REG:
8476     return EmitPCMP(MI, BB, 3, false /* in-mem */);
8477   case X86::PCMPISTRM128MEM:
8478     return EmitPCMP(MI, BB, 3, true /* in-mem */);
8479   case X86::PCMPESTRM128REG:
8480     return EmitPCMP(MI, BB, 5, false /* in mem */);
8481   case X86::PCMPESTRM128MEM:
8482     return EmitPCMP(MI, BB, 5, true /* in mem */);
8483
8484     // Atomic Lowering.
8485   case X86::ATOMAND32:
8486     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8487                                                X86::AND32ri, X86::MOV32rm,
8488                                                X86::LCMPXCHG32, X86::MOV32rr,
8489                                                X86::NOT32r, X86::EAX,
8490                                                X86::GR32RegisterClass);
8491   case X86::ATOMOR32:
8492     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
8493                                                X86::OR32ri, X86::MOV32rm,
8494                                                X86::LCMPXCHG32, X86::MOV32rr,
8495                                                X86::NOT32r, X86::EAX,
8496                                                X86::GR32RegisterClass);
8497   case X86::ATOMXOR32:
8498     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
8499                                                X86::XOR32ri, X86::MOV32rm,
8500                                                X86::LCMPXCHG32, X86::MOV32rr,
8501                                                X86::NOT32r, X86::EAX,
8502                                                X86::GR32RegisterClass);
8503   case X86::ATOMNAND32:
8504     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8505                                                X86::AND32ri, X86::MOV32rm,
8506                                                X86::LCMPXCHG32, X86::MOV32rr,
8507                                                X86::NOT32r, X86::EAX,
8508                                                X86::GR32RegisterClass, true);
8509   case X86::ATOMMIN32:
8510     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
8511   case X86::ATOMMAX32:
8512     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
8513   case X86::ATOMUMIN32:
8514     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
8515   case X86::ATOMUMAX32:
8516     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
8517
8518   case X86::ATOMAND16:
8519     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8520                                                X86::AND16ri, X86::MOV16rm,
8521                                                X86::LCMPXCHG16, X86::MOV16rr,
8522                                                X86::NOT16r, X86::AX,
8523                                                X86::GR16RegisterClass);
8524   case X86::ATOMOR16:
8525     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
8526                                                X86::OR16ri, X86::MOV16rm,
8527                                                X86::LCMPXCHG16, X86::MOV16rr,
8528                                                X86::NOT16r, X86::AX,
8529                                                X86::GR16RegisterClass);
8530   case X86::ATOMXOR16:
8531     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
8532                                                X86::XOR16ri, X86::MOV16rm,
8533                                                X86::LCMPXCHG16, X86::MOV16rr,
8534                                                X86::NOT16r, X86::AX,
8535                                                X86::GR16RegisterClass);
8536   case X86::ATOMNAND16:
8537     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8538                                                X86::AND16ri, X86::MOV16rm,
8539                                                X86::LCMPXCHG16, X86::MOV16rr,
8540                                                X86::NOT16r, X86::AX,
8541                                                X86::GR16RegisterClass, true);
8542   case X86::ATOMMIN16:
8543     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
8544   case X86::ATOMMAX16:
8545     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
8546   case X86::ATOMUMIN16:
8547     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
8548   case X86::ATOMUMAX16:
8549     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
8550
8551   case X86::ATOMAND8:
8552     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8553                                                X86::AND8ri, X86::MOV8rm,
8554                                                X86::LCMPXCHG8, X86::MOV8rr,
8555                                                X86::NOT8r, X86::AL,
8556                                                X86::GR8RegisterClass);
8557   case X86::ATOMOR8:
8558     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
8559                                                X86::OR8ri, X86::MOV8rm,
8560                                                X86::LCMPXCHG8, X86::MOV8rr,
8561                                                X86::NOT8r, X86::AL,
8562                                                X86::GR8RegisterClass);
8563   case X86::ATOMXOR8:
8564     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
8565                                                X86::XOR8ri, X86::MOV8rm,
8566                                                X86::LCMPXCHG8, X86::MOV8rr,
8567                                                X86::NOT8r, X86::AL,
8568                                                X86::GR8RegisterClass);
8569   case X86::ATOMNAND8:
8570     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8571                                                X86::AND8ri, X86::MOV8rm,
8572                                                X86::LCMPXCHG8, X86::MOV8rr,
8573                                                X86::NOT8r, X86::AL,
8574                                                X86::GR8RegisterClass, true);
8575   // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
8576   // This group is for 64-bit host.
8577   case X86::ATOMAND64:
8578     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8579                                                X86::AND64ri32, X86::MOV64rm,
8580                                                X86::LCMPXCHG64, X86::MOV64rr,
8581                                                X86::NOT64r, X86::RAX,
8582                                                X86::GR64RegisterClass);
8583   case X86::ATOMOR64:
8584     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
8585                                                X86::OR64ri32, X86::MOV64rm,
8586                                                X86::LCMPXCHG64, X86::MOV64rr,
8587                                                X86::NOT64r, X86::RAX,
8588                                                X86::GR64RegisterClass);
8589   case X86::ATOMXOR64:
8590     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
8591                                                X86::XOR64ri32, X86::MOV64rm,
8592                                                X86::LCMPXCHG64, X86::MOV64rr,
8593                                                X86::NOT64r, X86::RAX,
8594                                                X86::GR64RegisterClass);
8595   case X86::ATOMNAND64:
8596     return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8597                                                X86::AND64ri32, X86::MOV64rm,
8598                                                X86::LCMPXCHG64, X86::MOV64rr,
8599                                                X86::NOT64r, X86::RAX,
8600                                                X86::GR64RegisterClass, true);
8601   case X86::ATOMMIN64:
8602     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
8603   case X86::ATOMMAX64:
8604     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
8605   case X86::ATOMUMIN64:
8606     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
8607   case X86::ATOMUMAX64:
8608     return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
8609
8610   // This group does 64-bit operations on a 32-bit host.
8611   case X86::ATOMAND6432:
8612     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8613                                                X86::AND32rr, X86::AND32rr,
8614                                                X86::AND32ri, X86::AND32ri,
8615                                                false);
8616   case X86::ATOMOR6432:
8617     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8618                                                X86::OR32rr, X86::OR32rr,
8619                                                X86::OR32ri, X86::OR32ri,
8620                                                false);
8621   case X86::ATOMXOR6432:
8622     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8623                                                X86::XOR32rr, X86::XOR32rr,
8624                                                X86::XOR32ri, X86::XOR32ri,
8625                                                false);
8626   case X86::ATOMNAND6432:
8627     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8628                                                X86::AND32rr, X86::AND32rr,
8629                                                X86::AND32ri, X86::AND32ri,
8630                                                true);
8631   case X86::ATOMADD6432:
8632     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8633                                                X86::ADD32rr, X86::ADC32rr,
8634                                                X86::ADD32ri, X86::ADC32ri,
8635                                                false);
8636   case X86::ATOMSUB6432:
8637     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8638                                                X86::SUB32rr, X86::SBB32rr,
8639                                                X86::SUB32ri, X86::SBB32ri,
8640                                                false);
8641   case X86::ATOMSWAP6432:
8642     return EmitAtomicBit6432WithCustomInserter(MI, BB,
8643                                                X86::MOV32rr, X86::MOV32rr,
8644                                                X86::MOV32ri, X86::MOV32ri,
8645                                                false);
8646   case X86::VASTART_SAVE_XMM_REGS:
8647     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
8648   }
8649 }
8650
8651 //===----------------------------------------------------------------------===//
8652 //                           X86 Optimization Hooks
8653 //===----------------------------------------------------------------------===//
8654
8655 void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
8656                                                        const APInt &Mask,
8657                                                        APInt &KnownZero,
8658                                                        APInt &KnownOne,
8659                                                        const SelectionDAG &DAG,
8660                                                        unsigned Depth) const {
8661   unsigned Opc = Op.getOpcode();
8662   assert((Opc >= ISD::BUILTIN_OP_END ||
8663           Opc == ISD::INTRINSIC_WO_CHAIN ||
8664           Opc == ISD::INTRINSIC_W_CHAIN ||
8665           Opc == ISD::INTRINSIC_VOID) &&
8666          "Should use MaskedValueIsZero if you don't know whether Op"
8667          " is a target node!");
8668
8669   KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
8670   switch (Opc) {
8671   default: break;
8672   case X86ISD::ADD:
8673   case X86ISD::SUB:
8674   case X86ISD::SMUL:
8675   case X86ISD::UMUL:
8676   case X86ISD::INC:
8677   case X86ISD::DEC:
8678   case X86ISD::OR:
8679   case X86ISD::XOR:
8680   case X86ISD::AND:
8681     // These nodes' second result is a boolean.
8682     if (Op.getResNo() == 0)
8683       break;
8684     // Fallthrough
8685   case X86ISD::SETCC:
8686     KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8687                                        Mask.getBitWidth() - 1);
8688     break;
8689   }
8690 }
8691
8692 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8693 /// node is a GlobalAddress + offset.
8694 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8695                                        GlobalValue* &GA, int64_t &Offset) const{
8696   if (N->getOpcode() == X86ISD::Wrapper) {
8697     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8698       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8699       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8700       return true;
8701     }
8702   }
8703   return TargetLowering::isGAPlusOffset(N, GA, Offset);
8704 }
8705
8706 static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
8707                                      EVT EltVT, LoadSDNode *&LDBase,
8708                                      unsigned &LastLoadedElt,
8709                                      SelectionDAG &DAG, MachineFrameInfo *MFI,
8710                                      const TargetLowering &TLI) {
8711   LDBase = NULL;
8712   LastLoadedElt = -1U;
8713   for (unsigned i = 0; i < NumElems; ++i) {
8714     if (N->getMaskElt(i) < 0) {
8715       if (!LDBase)
8716         return false;
8717       continue;
8718     }
8719
8720     SDValue Elt = DAG.getShuffleScalarElt(N, i);
8721     if (!Elt.getNode() ||
8722         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
8723       return false;
8724     if (!LDBase) {
8725       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
8726         return false;
8727       LDBase = cast<LoadSDNode>(Elt.getNode());
8728       LastLoadedElt = i;
8729       continue;
8730     }
8731     if (Elt.getOpcode() == ISD::UNDEF)
8732       continue;
8733
8734     LoadSDNode *LD = cast<LoadSDNode>(Elt);
8735     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
8736       return false;
8737     LastLoadedElt = i;
8738   }
8739   return true;
8740 }
8741
8742 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8743 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8744 /// if the load addresses are consecutive, non-overlapping, and in the right
8745 /// order.  In the case of v2i64, it will see if it can rewrite the
8746 /// shuffle to be an appropriate build vector so it can take advantage of
8747 // performBuildVectorCombine.
8748 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8749                                      const TargetLowering &TLI) {
8750   DebugLoc dl = N->getDebugLoc();
8751   EVT VT = N->getValueType(0);
8752   EVT EltVT = VT.getVectorElementType();
8753   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8754   unsigned NumElems = VT.getVectorNumElements();
8755
8756   if (VT.getSizeInBits() != 128)
8757     return SDValue();
8758
8759   // Try to combine a vector_shuffle into a 128-bit load.
8760   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8761   LoadSDNode *LD = NULL;
8762   unsigned LastLoadedElt;
8763   if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
8764                                 MFI, TLI))
8765     return SDValue();
8766
8767   if (LastLoadedElt == NumElems - 1) {
8768     if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
8769       return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8770                          LD->getSrcValue(), LD->getSrcValueOffset(),
8771                          LD->isVolatile(), LD->isNonTemporal(), 0);
8772     return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8773                        LD->getSrcValue(), LD->getSrcValueOffset(),
8774                        LD->isVolatile(), LD->isNonTemporal(),
8775                        LD->getAlignment());
8776   } else if (NumElems == 4 && LastLoadedElt == 1) {
8777     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
8778     SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
8779     SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
8780     return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
8781   }
8782   return SDValue();
8783 }
8784
8785 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
8786 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
8787                                     const X86Subtarget *Subtarget) {
8788   DebugLoc DL = N->getDebugLoc();
8789   SDValue Cond = N->getOperand(0);
8790   // Get the LHS/RHS of the select.
8791   SDValue LHS = N->getOperand(1);
8792   SDValue RHS = N->getOperand(2);
8793
8794   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
8795   // instructions have the peculiarity that if either operand is a NaN,
8796   // they chose what we call the RHS operand (and as such are not symmetric).
8797   // It happens that this matches the semantics of the common C idiom
8798   // x<y?x:y and related forms, so we can recognize these cases.
8799   if (Subtarget->hasSSE2() &&
8800       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
8801       Cond.getOpcode() == ISD::SETCC) {
8802     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
8803
8804     unsigned Opcode = 0;
8805     // Check for x CC y ? x : y.
8806     if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
8807       switch (CC) {
8808       default: break;
8809       case ISD::SETULT:
8810         // This can be a min if we can prove that at least one of the operands
8811         // is not a nan.
8812         if (!FiniteOnlyFPMath()) {
8813           if (DAG.isKnownNeverNaN(RHS)) {
8814             // Put the potential NaN in the RHS so that SSE will preserve it.
8815             std::swap(LHS, RHS);
8816           } else if (!DAG.isKnownNeverNaN(LHS))
8817             break;
8818         }
8819         Opcode = X86ISD::FMIN;
8820         break;
8821       case ISD::SETOLE:
8822         // This can be a min if we can prove that at least one of the operands
8823         // is not a nan.
8824         if (!FiniteOnlyFPMath()) {
8825           if (DAG.isKnownNeverNaN(LHS)) {
8826             // Put the potential NaN in the RHS so that SSE will preserve it.
8827             std::swap(LHS, RHS);
8828           } else if (!DAG.isKnownNeverNaN(RHS))
8829             break;
8830         }
8831         Opcode = X86ISD::FMIN;
8832         break;
8833       case ISD::SETULE:
8834         // This can be a min, but if either operand is a NaN we need it to
8835         // preserve the original LHS.
8836         std::swap(LHS, RHS);
8837       case ISD::SETOLT:
8838       case ISD::SETLT:
8839       case ISD::SETLE:
8840         Opcode = X86ISD::FMIN;
8841         break;
8842
8843       case ISD::SETOGE:
8844         // This can be a max if we can prove that at least one of the operands
8845         // is not a nan.
8846         if (!FiniteOnlyFPMath()) {
8847           if (DAG.isKnownNeverNaN(LHS)) {
8848             // Put the potential NaN in the RHS so that SSE will preserve it.
8849             std::swap(LHS, RHS);
8850           } else if (!DAG.isKnownNeverNaN(RHS))
8851             break;
8852         }
8853         Opcode = X86ISD::FMAX;
8854         break;
8855       case ISD::SETUGT:
8856         // This can be a max if we can prove that at least one of the operands
8857         // is not a nan.
8858         if (!FiniteOnlyFPMath()) {
8859           if (DAG.isKnownNeverNaN(RHS)) {
8860             // Put the potential NaN in the RHS so that SSE will preserve it.
8861             std::swap(LHS, RHS);
8862           } else if (!DAG.isKnownNeverNaN(LHS))
8863             break;
8864         }
8865         Opcode = X86ISD::FMAX;
8866         break;
8867       case ISD::SETUGE:
8868         // This can be a max, but if either operand is a NaN we need it to
8869         // preserve the original LHS.
8870         std::swap(LHS, RHS);
8871       case ISD::SETOGT:
8872       case ISD::SETGT:
8873       case ISD::SETGE:
8874         Opcode = X86ISD::FMAX;
8875         break;
8876       }
8877     // Check for x CC y ? y : x -- a min/max with reversed arms.
8878     } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
8879       switch (CC) {
8880       default: break;
8881       case ISD::SETOGE:
8882         // This can be a min if we can prove that at least one of the operands
8883         // is not a nan.
8884         if (!FiniteOnlyFPMath()) {
8885           if (DAG.isKnownNeverNaN(RHS)) {
8886             // Put the potential NaN in the RHS so that SSE will preserve it.
8887             std::swap(LHS, RHS);
8888           } else if (!DAG.isKnownNeverNaN(LHS))
8889             break;
8890         }
8891         Opcode = X86ISD::FMIN;
8892         break;
8893       case ISD::SETUGT:
8894         // This can be a min if we can prove that at least one of the operands
8895         // is not a nan.
8896         if (!FiniteOnlyFPMath()) {
8897           if (DAG.isKnownNeverNaN(LHS)) {
8898             // Put the potential NaN in the RHS so that SSE will preserve it.
8899             std::swap(LHS, RHS);
8900           } else if (!DAG.isKnownNeverNaN(RHS))
8901             break;
8902         }
8903         Opcode = X86ISD::FMIN;
8904         break;
8905       case ISD::SETUGE:
8906         // This can be a min, but if either operand is a NaN we need it to
8907         // preserve the original LHS.
8908         std::swap(LHS, RHS);
8909       case ISD::SETOGT:
8910       case ISD::SETGT:
8911       case ISD::SETGE:
8912         Opcode = X86ISD::FMIN;
8913         break;
8914
8915       case ISD::SETULT:
8916         // This can be a max if we can prove that at least one of the operands
8917         // is not a nan.
8918         if (!FiniteOnlyFPMath()) {
8919           if (DAG.isKnownNeverNaN(LHS)) {
8920             // Put the potential NaN in the RHS so that SSE will preserve it.
8921             std::swap(LHS, RHS);
8922           } else if (!DAG.isKnownNeverNaN(RHS))
8923             break;
8924         }
8925         Opcode = X86ISD::FMAX;
8926         break;
8927       case ISD::SETOLE:
8928         // This can be a max if we can prove that at least one of the operands
8929         // is not a nan.
8930         if (!FiniteOnlyFPMath()) {
8931           if (DAG.isKnownNeverNaN(RHS)) {
8932             // Put the potential NaN in the RHS so that SSE will preserve it.
8933             std::swap(LHS, RHS);
8934           } else if (!DAG.isKnownNeverNaN(LHS))
8935             break;
8936         }
8937         Opcode = X86ISD::FMAX;
8938         break;
8939       case ISD::SETULE:
8940         // This can be a max, but if either operand is a NaN we need it to
8941         // preserve the original LHS.
8942         std::swap(LHS, RHS);
8943       case ISD::SETOLT:
8944       case ISD::SETLT:
8945       case ISD::SETLE:
8946         Opcode = X86ISD::FMAX;
8947         break;
8948       }
8949     }
8950
8951     if (Opcode)
8952       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
8953   }
8954
8955   // If this is a select between two integer constants, try to do some
8956   // optimizations.
8957   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
8958     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
8959       // Don't do this for crazy integer types.
8960       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
8961         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
8962         // so that TrueC (the true value) is larger than FalseC.
8963         bool NeedsCondInvert = false;
8964
8965         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
8966             // Efficiently invertible.
8967             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
8968              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
8969               isa<ConstantSDNode>(Cond.getOperand(1))))) {
8970           NeedsCondInvert = true;
8971           std::swap(TrueC, FalseC);
8972         }
8973
8974         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
8975         if (FalseC->getAPIntValue() == 0 &&
8976             TrueC->getAPIntValue().isPowerOf2()) {
8977           if (NeedsCondInvert) // Invert the condition if needed.
8978             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8979                                DAG.getConstant(1, Cond.getValueType()));
8980
8981           // Zero extend the condition if needed.
8982           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
8983
8984           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8985           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
8986                              DAG.getConstant(ShAmt, MVT::i8));
8987         }
8988
8989         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
8990         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8991           if (NeedsCondInvert) // Invert the condition if needed.
8992             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8993                                DAG.getConstant(1, Cond.getValueType()));
8994
8995           // Zero extend the condition if needed.
8996           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8997                              FalseC->getValueType(0), Cond);
8998           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8999                              SDValue(FalseC, 0));
9000         }
9001
9002         // Optimize cases that will turn into an LEA instruction.  This requires
9003         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9004         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9005           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9006           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9007
9008           bool isFastMultiplier = false;
9009           if (Diff < 10) {
9010             switch ((unsigned char)Diff) {
9011               default: break;
9012               case 1:  // result = add base, cond
9013               case 2:  // result = lea base(    , cond*2)
9014               case 3:  // result = lea base(cond, cond*2)
9015               case 4:  // result = lea base(    , cond*4)
9016               case 5:  // result = lea base(cond, cond*4)
9017               case 8:  // result = lea base(    , cond*8)
9018               case 9:  // result = lea base(cond, cond*8)
9019                 isFastMultiplier = true;
9020                 break;
9021             }
9022           }
9023
9024           if (isFastMultiplier) {
9025             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9026             if (NeedsCondInvert) // Invert the condition if needed.
9027               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9028                                  DAG.getConstant(1, Cond.getValueType()));
9029
9030             // Zero extend the condition if needed.
9031             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9032                                Cond);
9033             // Scale the condition by the difference.
9034             if (Diff != 1)
9035               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9036                                  DAG.getConstant(Diff, Cond.getValueType()));
9037
9038             // Add the base if non-zero.
9039             if (FalseC->getAPIntValue() != 0)
9040               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9041                                  SDValue(FalseC, 0));
9042             return Cond;
9043           }
9044         }
9045       }
9046   }
9047
9048   return SDValue();
9049 }
9050
9051 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9052 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9053                                   TargetLowering::DAGCombinerInfo &DCI) {
9054   DebugLoc DL = N->getDebugLoc();
9055
9056   // If the flag operand isn't dead, don't touch this CMOV.
9057   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9058     return SDValue();
9059
9060   // If this is a select between two integer constants, try to do some
9061   // optimizations.  Note that the operands are ordered the opposite of SELECT
9062   // operands.
9063   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9064     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9065       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9066       // larger than FalseC (the false value).
9067       X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9068
9069       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9070         CC = X86::GetOppositeBranchCondition(CC);
9071         std::swap(TrueC, FalseC);
9072       }
9073
9074       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9075       // This is efficient for any integer data type (including i8/i16) and
9076       // shift amount.
9077       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9078         SDValue Cond = N->getOperand(3);
9079         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9080                            DAG.getConstant(CC, MVT::i8), Cond);
9081
9082         // Zero extend the condition if needed.
9083         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9084
9085         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9086         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9087                            DAG.getConstant(ShAmt, MVT::i8));
9088         if (N->getNumValues() == 2)  // Dead flag value?
9089           return DCI.CombineTo(N, Cond, SDValue());
9090         return Cond;
9091       }
9092
9093       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
9094       // for any integer data type, including i8/i16.
9095       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9096         SDValue Cond = N->getOperand(3);
9097         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9098                            DAG.getConstant(CC, MVT::i8), Cond);
9099
9100         // Zero extend the condition if needed.
9101         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9102                            FalseC->getValueType(0), Cond);
9103         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9104                            SDValue(FalseC, 0));
9105
9106         if (N->getNumValues() == 2)  // Dead flag value?
9107           return DCI.CombineTo(N, Cond, SDValue());
9108         return Cond;
9109       }
9110
9111       // Optimize cases that will turn into an LEA instruction.  This requires
9112       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9113       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9114         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9115         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9116
9117         bool isFastMultiplier = false;
9118         if (Diff < 10) {
9119           switch ((unsigned char)Diff) {
9120           default: break;
9121           case 1:  // result = add base, cond
9122           case 2:  // result = lea base(    , cond*2)
9123           case 3:  // result = lea base(cond, cond*2)
9124           case 4:  // result = lea base(    , cond*4)
9125           case 5:  // result = lea base(cond, cond*4)
9126           case 8:  // result = lea base(    , cond*8)
9127           case 9:  // result = lea base(cond, cond*8)
9128             isFastMultiplier = true;
9129             break;
9130           }
9131         }
9132
9133         if (isFastMultiplier) {
9134           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9135           SDValue Cond = N->getOperand(3);
9136           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9137                              DAG.getConstant(CC, MVT::i8), Cond);
9138           // Zero extend the condition if needed.
9139           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9140                              Cond);
9141           // Scale the condition by the difference.
9142           if (Diff != 1)
9143             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9144                                DAG.getConstant(Diff, Cond.getValueType()));
9145
9146           // Add the base if non-zero.
9147           if (FalseC->getAPIntValue() != 0)
9148             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9149                                SDValue(FalseC, 0));
9150           if (N->getNumValues() == 2)  // Dead flag value?
9151             return DCI.CombineTo(N, Cond, SDValue());
9152           return Cond;
9153         }
9154       }
9155     }
9156   }
9157   return SDValue();
9158 }
9159
9160
9161 /// PerformMulCombine - Optimize a single multiply with constant into two
9162 /// in order to implement it with two cheaper instructions, e.g.
9163 /// LEA + SHL, LEA + LEA.
9164 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9165                                  TargetLowering::DAGCombinerInfo &DCI) {
9166   if (DAG.getMachineFunction().
9167       getFunction()->hasFnAttr(Attribute::OptimizeForSize))
9168     return SDValue();
9169
9170   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9171     return SDValue();
9172
9173   EVT VT = N->getValueType(0);
9174   if (VT != MVT::i64)
9175     return SDValue();
9176
9177   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9178   if (!C)
9179     return SDValue();
9180   uint64_t MulAmt = C->getZExtValue();
9181   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9182     return SDValue();
9183
9184   uint64_t MulAmt1 = 0;
9185   uint64_t MulAmt2 = 0;
9186   if ((MulAmt % 9) == 0) {
9187     MulAmt1 = 9;
9188     MulAmt2 = MulAmt / 9;
9189   } else if ((MulAmt % 5) == 0) {
9190     MulAmt1 = 5;
9191     MulAmt2 = MulAmt / 5;
9192   } else if ((MulAmt % 3) == 0) {
9193     MulAmt1 = 3;
9194     MulAmt2 = MulAmt / 3;
9195   }
9196   if (MulAmt2 &&
9197       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9198     DebugLoc DL = N->getDebugLoc();
9199
9200     if (isPowerOf2_64(MulAmt2) &&
9201         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9202       // If second multiplifer is pow2, issue it first. We want the multiply by
9203       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
9204       // is an add.
9205       std::swap(MulAmt1, MulAmt2);
9206
9207     SDValue NewMul;
9208     if (isPowerOf2_64(MulAmt1))
9209       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9210                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9211     else
9212       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9213                            DAG.getConstant(MulAmt1, VT));
9214
9215     if (isPowerOf2_64(MulAmt2))
9216       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9217                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9218     else
9219       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9220                            DAG.getConstant(MulAmt2, VT));
9221
9222     // Do not add new nodes to DAG combiner worklist.
9223     DCI.CombineTo(N, NewMul, false);
9224   }
9225   return SDValue();
9226 }
9227
9228 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9229   SDValue N0 = N->getOperand(0);
9230   SDValue N1 = N->getOperand(1);
9231   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9232   EVT VT = N0.getValueType();
9233
9234   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9235   // since the result of setcc_c is all zero's or all ones.
9236   if (N1C && N0.getOpcode() == ISD::AND &&
9237       N0.getOperand(1).getOpcode() == ISD::Constant) {
9238     SDValue N00 = N0.getOperand(0);
9239     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9240         ((N00.getOpcode() == ISD::ANY_EXTEND ||
9241           N00.getOpcode() == ISD::ZERO_EXTEND) &&
9242          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9243       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9244       APInt ShAmt = N1C->getAPIntValue();
9245       Mask = Mask.shl(ShAmt);
9246       if (Mask != 0)
9247         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9248                            N00, DAG.getConstant(Mask, VT));
9249     }
9250   }
9251
9252   return SDValue();
9253 }
9254
9255 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9256 ///                       when possible.
9257 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9258                                    const X86Subtarget *Subtarget) {
9259   EVT VT = N->getValueType(0);
9260   if (!VT.isVector() && VT.isInteger() &&
9261       N->getOpcode() == ISD::SHL)
9262     return PerformSHLCombine(N, DAG);
9263
9264   // On X86 with SSE2 support, we can transform this to a vector shift if
9265   // all elements are shifted by the same amount.  We can't do this in legalize
9266   // because the a constant vector is typically transformed to a constant pool
9267   // so we have no knowledge of the shift amount.
9268   if (!Subtarget->hasSSE2())
9269     return SDValue();
9270
9271   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9272     return SDValue();
9273
9274   SDValue ShAmtOp = N->getOperand(1);
9275   EVT EltVT = VT.getVectorElementType();
9276   DebugLoc DL = N->getDebugLoc();
9277   SDValue BaseShAmt = SDValue();
9278   if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9279     unsigned NumElts = VT.getVectorNumElements();
9280     unsigned i = 0;
9281     for (; i != NumElts; ++i) {
9282       SDValue Arg = ShAmtOp.getOperand(i);
9283       if (Arg.getOpcode() == ISD::UNDEF) continue;
9284       BaseShAmt = Arg;
9285       break;
9286     }
9287     for (; i != NumElts; ++i) {
9288       SDValue Arg = ShAmtOp.getOperand(i);
9289       if (Arg.getOpcode() == ISD::UNDEF) continue;
9290       if (Arg != BaseShAmt) {
9291         return SDValue();
9292       }
9293     }
9294   } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9295              cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9296     SDValue InVec = ShAmtOp.getOperand(0);
9297     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9298       unsigned NumElts = InVec.getValueType().getVectorNumElements();
9299       unsigned i = 0;
9300       for (; i != NumElts; ++i) {
9301         SDValue Arg = InVec.getOperand(i);
9302         if (Arg.getOpcode() == ISD::UNDEF) continue;
9303         BaseShAmt = Arg;
9304         break;
9305       }
9306     } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9307        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9308          unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9309          if (C->getZExtValue() == SplatIdx)
9310            BaseShAmt = InVec.getOperand(1);
9311        }
9312     }
9313     if (BaseShAmt.getNode() == 0)
9314       BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9315                               DAG.getIntPtrConstant(0));
9316   } else
9317     return SDValue();
9318
9319   // The shift amount is an i32.
9320   if (EltVT.bitsGT(MVT::i32))
9321     BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9322   else if (EltVT.bitsLT(MVT::i32))
9323     BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9324
9325   // The shift amount is identical so we can do a vector shift.
9326   SDValue  ValOp = N->getOperand(0);
9327   switch (N->getOpcode()) {
9328   default:
9329     llvm_unreachable("Unknown shift opcode!");
9330     break;
9331   case ISD::SHL:
9332     if (VT == MVT::v2i64)
9333       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9334                          DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9335                          ValOp, BaseShAmt);
9336     if (VT == MVT::v4i32)
9337       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9338                          DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9339                          ValOp, BaseShAmt);
9340     if (VT == MVT::v8i16)
9341       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9342                          DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9343                          ValOp, BaseShAmt);
9344     break;
9345   case ISD::SRA:
9346     if (VT == MVT::v4i32)
9347       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9348                          DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9349                          ValOp, BaseShAmt);
9350     if (VT == MVT::v8i16)
9351       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9352                          DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9353                          ValOp, BaseShAmt);
9354     break;
9355   case ISD::SRL:
9356     if (VT == MVT::v2i64)
9357       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9358                          DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9359                          ValOp, BaseShAmt);
9360     if (VT == MVT::v4i32)
9361       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9362                          DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9363                          ValOp, BaseShAmt);
9364     if (VT ==  MVT::v8i16)
9365       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9366                          DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9367                          ValOp, BaseShAmt);
9368     break;
9369   }
9370   return SDValue();
9371 }
9372
9373 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
9374                                 const X86Subtarget *Subtarget) {
9375   EVT VT = N->getValueType(0);
9376   if (VT != MVT::i64 || !Subtarget->is64Bit())
9377     return SDValue();
9378
9379   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
9380   SDValue N0 = N->getOperand(0);
9381   SDValue N1 = N->getOperand(1);
9382   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
9383     std::swap(N0, N1);
9384   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
9385     return SDValue();
9386
9387   SDValue ShAmt0 = N0.getOperand(1);
9388   if (ShAmt0.getValueType() != MVT::i8)
9389     return SDValue();
9390   SDValue ShAmt1 = N1.getOperand(1);
9391   if (ShAmt1.getValueType() != MVT::i8)
9392     return SDValue();
9393   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
9394     ShAmt0 = ShAmt0.getOperand(0);
9395   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
9396     ShAmt1 = ShAmt1.getOperand(0);
9397
9398   DebugLoc DL = N->getDebugLoc();
9399   unsigned Opc = X86ISD::SHLD;
9400   SDValue Op0 = N0.getOperand(0);
9401   SDValue Op1 = N1.getOperand(0);
9402   if (ShAmt0.getOpcode() == ISD::SUB) {
9403     Opc = X86ISD::SHRD;
9404     std::swap(Op0, Op1);
9405     std::swap(ShAmt0, ShAmt1);
9406   }
9407
9408   if (ShAmt1.getOpcode() == ISD::SUB) {
9409     SDValue Sum = ShAmt1.getOperand(0);
9410     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
9411       if (SumC->getSExtValue() == 64 &&
9412           ShAmt1.getOperand(1) == ShAmt0)
9413         return DAG.getNode(Opc, DL, VT,
9414                            Op0, Op1,
9415                            DAG.getNode(ISD::TRUNCATE, DL,
9416                                        MVT::i8, ShAmt0));
9417     }
9418   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
9419     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
9420     if (ShAmt0C &&
9421         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64)
9422       return DAG.getNode(Opc, DL, VT,
9423                          N0.getOperand(0), N1.getOperand(0),
9424                          DAG.getNode(ISD::TRUNCATE, DL,
9425                                        MVT::i8, ShAmt0));
9426   }
9427
9428   return SDValue();
9429 }
9430
9431 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
9432 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
9433                                    const X86Subtarget *Subtarget) {
9434   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
9435   // the FP state in cases where an emms may be missing.
9436   // A preferable solution to the general problem is to figure out the right
9437   // places to insert EMMS.  This qualifies as a quick hack.
9438
9439   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
9440   StoreSDNode *St = cast<StoreSDNode>(N);
9441   EVT VT = St->getValue().getValueType();
9442   if (VT.getSizeInBits() != 64)
9443     return SDValue();
9444
9445   const Function *F = DAG.getMachineFunction().getFunction();
9446   bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
9447   bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
9448     && Subtarget->hasSSE2();
9449   if ((VT.isVector() ||
9450        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
9451       isa<LoadSDNode>(St->getValue()) &&
9452       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
9453       St->getChain().hasOneUse() && !St->isVolatile()) {
9454     SDNode* LdVal = St->getValue().getNode();
9455     LoadSDNode *Ld = 0;
9456     int TokenFactorIndex = -1;
9457     SmallVector<SDValue, 8> Ops;
9458     SDNode* ChainVal = St->getChain().getNode();
9459     // Must be a store of a load.  We currently handle two cases:  the load
9460     // is a direct child, and it's under an intervening TokenFactor.  It is
9461     // possible to dig deeper under nested TokenFactors.
9462     if (ChainVal == LdVal)
9463       Ld = cast<LoadSDNode>(St->getChain());
9464     else if (St->getValue().hasOneUse() &&
9465              ChainVal->getOpcode() == ISD::TokenFactor) {
9466       for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
9467         if (ChainVal->getOperand(i).getNode() == LdVal) {
9468           TokenFactorIndex = i;
9469           Ld = cast<LoadSDNode>(St->getValue());
9470         } else
9471           Ops.push_back(ChainVal->getOperand(i));
9472       }
9473     }
9474
9475     if (!Ld || !ISD::isNormalLoad(Ld))
9476       return SDValue();
9477
9478     // If this is not the MMX case, i.e. we are just turning i64 load/store
9479     // into f64 load/store, avoid the transformation if there are multiple
9480     // uses of the loaded value.
9481     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
9482       return SDValue();
9483
9484     DebugLoc LdDL = Ld->getDebugLoc();
9485     DebugLoc StDL = N->getDebugLoc();
9486     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
9487     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
9488     // pair instead.
9489     if (Subtarget->is64Bit() || F64IsLegal) {
9490       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
9491       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
9492                                   Ld->getBasePtr(), Ld->getSrcValue(),
9493                                   Ld->getSrcValueOffset(), Ld->isVolatile(),
9494                                   Ld->isNonTemporal(), Ld->getAlignment());
9495       SDValue NewChain = NewLd.getValue(1);
9496       if (TokenFactorIndex != -1) {
9497         Ops.push_back(NewChain);
9498         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9499                                Ops.size());
9500       }
9501       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
9502                           St->getSrcValue(), St->getSrcValueOffset(),
9503                           St->isVolatile(), St->isNonTemporal(),
9504                           St->getAlignment());
9505     }
9506
9507     // Otherwise, lower to two pairs of 32-bit loads / stores.
9508     SDValue LoAddr = Ld->getBasePtr();
9509     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
9510                                  DAG.getConstant(4, MVT::i32));
9511
9512     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
9513                                Ld->getSrcValue(), Ld->getSrcValueOffset(),
9514                                Ld->isVolatile(), Ld->isNonTemporal(),
9515                                Ld->getAlignment());
9516     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
9517                                Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
9518                                Ld->isVolatile(), Ld->isNonTemporal(),
9519                                MinAlign(Ld->getAlignment(), 4));
9520
9521     SDValue NewChain = LoLd.getValue(1);
9522     if (TokenFactorIndex != -1) {
9523       Ops.push_back(LoLd);
9524       Ops.push_back(HiLd);
9525       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9526                              Ops.size());
9527     }
9528
9529     LoAddr = St->getBasePtr();
9530     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
9531                          DAG.getConstant(4, MVT::i32));
9532
9533     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
9534                                 St->getSrcValue(), St->getSrcValueOffset(),
9535                                 St->isVolatile(), St->isNonTemporal(),
9536                                 St->getAlignment());
9537     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
9538                                 St->getSrcValue(),
9539                                 St->getSrcValueOffset() + 4,
9540                                 St->isVolatile(),
9541                                 St->isNonTemporal(),
9542                                 MinAlign(St->getAlignment(), 4));
9543     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
9544   }
9545   return SDValue();
9546 }
9547
9548 /// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
9549 /// X86ISD::FXOR nodes.
9550 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
9551   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
9552   // F[X]OR(0.0, x) -> x
9553   // F[X]OR(x, 0.0) -> x
9554   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9555     if (C->getValueAPF().isPosZero())
9556       return N->getOperand(1);
9557   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9558     if (C->getValueAPF().isPosZero())
9559       return N->getOperand(0);
9560   return SDValue();
9561 }
9562
9563 /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
9564 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
9565   // FAND(0.0, x) -> 0.0
9566   // FAND(x, 0.0) -> 0.0
9567   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9568     if (C->getValueAPF().isPosZero())
9569       return N->getOperand(0);
9570   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9571     if (C->getValueAPF().isPosZero())
9572       return N->getOperand(1);
9573   return SDValue();
9574 }
9575
9576 static SDValue PerformBTCombine(SDNode *N,
9577                                 SelectionDAG &DAG,
9578                                 TargetLowering::DAGCombinerInfo &DCI) {
9579   // BT ignores high bits in the bit index operand.
9580   SDValue Op1 = N->getOperand(1);
9581   if (Op1.hasOneUse()) {
9582     unsigned BitWidth = Op1.getValueSizeInBits();
9583     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9584     APInt KnownZero, KnownOne;
9585     TargetLowering::TargetLoweringOpt TLO(DAG);
9586     TargetLowering &TLI = DAG.getTargetLoweringInfo();
9587     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9588         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9589       DCI.CommitTargetLoweringOpt(TLO);
9590   }
9591   return SDValue();
9592 }
9593
9594 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9595   SDValue Op = N->getOperand(0);
9596   if (Op.getOpcode() == ISD::BIT_CONVERT)
9597     Op = Op.getOperand(0);
9598   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9599   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9600       VT.getVectorElementType().getSizeInBits() ==
9601       OpVT.getVectorElementType().getSizeInBits()) {
9602     return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9603   }
9604   return SDValue();
9605 }
9606
9607 // On X86 and X86-64, atomic operations are lowered to locked instructions.
9608 // Locked instructions, in turn, have implicit fence semantics (all memory
9609 // operations are flushed before issuing the locked instruction, and the
9610 // are not buffered), so we can fold away the common pattern of
9611 // fence-atomic-fence.
9612 static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
9613   SDValue atomic = N->getOperand(0);
9614   switch (atomic.getOpcode()) {
9615     case ISD::ATOMIC_CMP_SWAP:
9616     case ISD::ATOMIC_SWAP:
9617     case ISD::ATOMIC_LOAD_ADD:
9618     case ISD::ATOMIC_LOAD_SUB:
9619     case ISD::ATOMIC_LOAD_AND:
9620     case ISD::ATOMIC_LOAD_OR:
9621     case ISD::ATOMIC_LOAD_XOR:
9622     case ISD::ATOMIC_LOAD_NAND:
9623     case ISD::ATOMIC_LOAD_MIN:
9624     case ISD::ATOMIC_LOAD_MAX:
9625     case ISD::ATOMIC_LOAD_UMIN:
9626     case ISD::ATOMIC_LOAD_UMAX:
9627       break;
9628     default:
9629       return SDValue();
9630   }
9631
9632   SDValue fence = atomic.getOperand(0);
9633   if (fence.getOpcode() != ISD::MEMBARRIER)
9634     return SDValue();
9635
9636   switch (atomic.getOpcode()) {
9637     case ISD::ATOMIC_CMP_SWAP:
9638       return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9639                                     atomic.getOperand(1), atomic.getOperand(2),
9640                                     atomic.getOperand(3));
9641     case ISD::ATOMIC_SWAP:
9642     case ISD::ATOMIC_LOAD_ADD:
9643     case ISD::ATOMIC_LOAD_SUB:
9644     case ISD::ATOMIC_LOAD_AND:
9645     case ISD::ATOMIC_LOAD_OR:
9646     case ISD::ATOMIC_LOAD_XOR:
9647     case ISD::ATOMIC_LOAD_NAND:
9648     case ISD::ATOMIC_LOAD_MIN:
9649     case ISD::ATOMIC_LOAD_MAX:
9650     case ISD::ATOMIC_LOAD_UMIN:
9651     case ISD::ATOMIC_LOAD_UMAX:
9652       return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9653                                     atomic.getOperand(1), atomic.getOperand(2));
9654     default:
9655       return SDValue();
9656   }
9657 }
9658
9659 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9660   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9661   //           (and (i32 x86isd::setcc_carry), 1)
9662   // This eliminates the zext. This transformation is necessary because
9663   // ISD::SETCC is always legalized to i8.
9664   DebugLoc dl = N->getDebugLoc();
9665   SDValue N0 = N->getOperand(0);
9666   EVT VT = N->getValueType(0);
9667   if (N0.getOpcode() == ISD::AND &&
9668       N0.hasOneUse() &&
9669       N0.getOperand(0).hasOneUse()) {
9670     SDValue N00 = N0.getOperand(0);
9671     if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9672       return SDValue();
9673     ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9674     if (!C || C->getZExtValue() != 1)
9675       return SDValue();
9676     return DAG.getNode(ISD::AND, dl, VT,
9677                        DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9678                                    N00.getOperand(0), N00.getOperand(1)),
9679                        DAG.getConstant(1, VT));
9680   }
9681
9682   return SDValue();
9683 }
9684
9685 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9686                                              DAGCombinerInfo &DCI) const {
9687   SelectionDAG &DAG = DCI.DAG;
9688   switch (N->getOpcode()) {
9689   default: break;
9690   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9691   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9692   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9693   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9694   case ISD::SHL:
9695   case ISD::SRA:
9696   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9697   case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
9698   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9699   case X86ISD::FXOR:
9700   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9701   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9702   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9703   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9704   case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
9705   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9706   }
9707
9708   return SDValue();
9709 }
9710
9711 //===----------------------------------------------------------------------===//
9712 //                           X86 Inline Assembly Support
9713 //===----------------------------------------------------------------------===//
9714
9715 static bool LowerToBSwap(CallInst *CI) {
9716   // FIXME: this should verify that we are targetting a 486 or better.  If not,
9717   // we will turn this bswap into something that will be lowered to logical ops
9718   // instead of emitting the bswap asm.  For now, we don't support 486 or lower
9719   // so don't worry about this.
9720
9721   // Verify this is a simple bswap.
9722   if (CI->getNumOperands() != 2 ||
9723       CI->getType() != CI->getOperand(1)->getType() ||
9724       !CI->getType()->isIntegerTy())
9725     return false;
9726
9727   const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
9728   if (!Ty || Ty->getBitWidth() % 16 != 0)
9729     return false;
9730
9731   // Okay, we can do this xform, do so now.
9732   const Type *Tys[] = { Ty };
9733   Module *M = CI->getParent()->getParent()->getParent();
9734   Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
9735
9736   Value *Op = CI->getOperand(1);
9737   Op = CallInst::Create(Int, Op, CI->getName(), CI);
9738
9739   CI->replaceAllUsesWith(Op);
9740   CI->eraseFromParent();
9741   return true;
9742 }
9743
9744 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
9745   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
9746   std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
9747
9748   std::string AsmStr = IA->getAsmString();
9749
9750   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
9751   SmallVector<StringRef, 4> AsmPieces;
9752   SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
9753
9754   switch (AsmPieces.size()) {
9755   default: return false;
9756   case 1:
9757     AsmStr = AsmPieces[0];
9758     AsmPieces.clear();
9759     SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
9760
9761     // bswap $0
9762     if (AsmPieces.size() == 2 &&
9763         (AsmPieces[0] == "bswap" ||
9764          AsmPieces[0] == "bswapq" ||
9765          AsmPieces[0] == "bswapl") &&
9766         (AsmPieces[1] == "$0" ||
9767          AsmPieces[1] == "${0:q}")) {
9768       // No need to check constraints, nothing other than the equivalent of
9769       // "=r,0" would be valid here.
9770       return LowerToBSwap(CI);
9771     }
9772     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
9773     if (CI->getType()->isIntegerTy(16) &&
9774         AsmPieces.size() == 3 &&
9775         AsmPieces[0] == "rorw" &&
9776         AsmPieces[1] == "$$8," &&
9777         AsmPieces[2] == "${0:w}" &&
9778         IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
9779       return LowerToBSwap(CI);
9780     }
9781     break;
9782   case 3:
9783     if (CI->getType()->isIntegerTy(64) &&
9784         Constraints.size() >= 2 &&
9785         Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
9786         Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
9787       // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
9788       SmallVector<StringRef, 4> Words;
9789       SplitString(AsmPieces[0], Words, " \t");
9790       if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
9791         Words.clear();
9792         SplitString(AsmPieces[1], Words, " \t");
9793         if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
9794           Words.clear();
9795           SplitString(AsmPieces[2], Words, " \t,");
9796           if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
9797               Words[2] == "%edx") {
9798             return LowerToBSwap(CI);
9799           }
9800         }
9801       }
9802     }
9803     break;
9804   }
9805   return false;
9806 }
9807
9808
9809
9810 /// getConstraintType - Given a constraint letter, return the type of
9811 /// constraint it is for this target.
9812 X86TargetLowering::ConstraintType
9813 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
9814   if (Constraint.size() == 1) {
9815     switch (Constraint[0]) {
9816     case 'A':
9817       return C_Register;
9818     case 'f':
9819     case 'r':
9820     case 'R':
9821     case 'l':
9822     case 'q':
9823     case 'Q':
9824     case 'x':
9825     case 'y':
9826     case 'Y':
9827       return C_RegisterClass;
9828     case 'e':
9829     case 'Z':
9830       return C_Other;
9831     default:
9832       break;
9833     }
9834   }
9835   return TargetLowering::getConstraintType(Constraint);
9836 }
9837
9838 /// LowerXConstraint - try to replace an X constraint, which matches anything,
9839 /// with another that has more specific requirements based on the type of the
9840 /// corresponding operand.
9841 const char *X86TargetLowering::
9842 LowerXConstraint(EVT ConstraintVT) const {
9843   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
9844   // 'f' like normal targets.
9845   if (ConstraintVT.isFloatingPoint()) {
9846     if (Subtarget->hasSSE2())
9847       return "Y";
9848     if (Subtarget->hasSSE1())
9849       return "x";
9850   }
9851
9852   return TargetLowering::LowerXConstraint(ConstraintVT);
9853 }
9854
9855 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9856 /// vector.  If it is invalid, don't add anything to Ops.
9857 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
9858                                                      char Constraint,
9859                                                      bool hasMemory,
9860                                                      std::vector<SDValue>&Ops,
9861                                                      SelectionDAG &DAG) const {
9862   SDValue Result(0, 0);
9863
9864   switch (Constraint) {
9865   default: break;
9866   case 'I':
9867     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9868       if (C->getZExtValue() <= 31) {
9869         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9870         break;
9871       }
9872     }
9873     return;
9874   case 'J':
9875     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9876       if (C->getZExtValue() <= 63) {
9877         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9878         break;
9879       }
9880     }
9881     return;
9882   case 'K':
9883     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9884       if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
9885         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9886         break;
9887       }
9888     }
9889     return;
9890   case 'N':
9891     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9892       if (C->getZExtValue() <= 255) {
9893         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9894         break;
9895       }
9896     }
9897     return;
9898   case 'e': {
9899     // 32-bit signed value
9900     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9901       const ConstantInt *CI = C->getConstantIntValue();
9902       if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
9903                                   C->getSExtValue())) {
9904         // Widen to 64 bits here to get it sign extended.
9905         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
9906         break;
9907       }
9908     // FIXME gcc accepts some relocatable values here too, but only in certain
9909     // memory models; it's complicated.
9910     }
9911     return;
9912   }
9913   case 'Z': {
9914     // 32-bit unsigned value
9915     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
9916       const ConstantInt *CI = C->getConstantIntValue();
9917       if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
9918                                   C->getZExtValue())) {
9919         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
9920         break;
9921       }
9922     }
9923     // FIXME gcc accepts some relocatable values here too, but only in certain
9924     // memory models; it's complicated.
9925     return;
9926   }
9927   case 'i': {
9928     // Literal immediates are always ok.
9929     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
9930       // Widen to 64 bits here to get it sign extended.
9931       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
9932       break;
9933     }
9934
9935     // If we are in non-pic codegen mode, we allow the address of a global (with
9936     // an optional displacement) to be used with 'i'.
9937     GlobalAddressSDNode *GA = 0;
9938     int64_t Offset = 0;
9939
9940     // Match either (GA), (GA+C), (GA+C1+C2), etc.
9941     while (1) {
9942       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
9943         Offset += GA->getOffset();
9944         break;
9945       } else if (Op.getOpcode() == ISD::ADD) {
9946         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
9947           Offset += C->getZExtValue();
9948           Op = Op.getOperand(0);
9949           continue;
9950         }
9951       } else if (Op.getOpcode() == ISD::SUB) {
9952         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
9953           Offset += -C->getZExtValue();
9954           Op = Op.getOperand(0);
9955           continue;
9956         }
9957       }
9958
9959       // Otherwise, this isn't something we can handle, reject it.
9960       return;
9961     }
9962
9963     GlobalValue *GV = GA->getGlobal();
9964     // If we require an extra load to get this address, as in PIC mode, we
9965     // can't accept it.
9966     if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
9967                                                         getTargetMachine())))
9968       return;
9969
9970     if (hasMemory)
9971       Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
9972     else
9973       Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
9974     Result = Op;
9975     break;
9976   }
9977   }
9978
9979   if (Result.getNode()) {
9980     Ops.push_back(Result);
9981     return;
9982   }
9983   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
9984                                                       Ops, DAG);
9985 }
9986
9987 std::vector<unsigned> X86TargetLowering::
9988 getRegClassForInlineAsmConstraint(const std::string &Constraint,
9989                                   EVT VT) const {
9990   if (Constraint.size() == 1) {
9991     // FIXME: not handling fp-stack yet!
9992     switch (Constraint[0]) {      // GCC X86 Constraint Letters
9993     default: break;  // Unknown constraint letter
9994     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
9995       if (Subtarget->is64Bit()) {
9996         if (VT == MVT::i32)
9997           return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
9998                                        X86::ESI, X86::EDI, X86::R8D, X86::R9D,
9999                                        X86::R10D,X86::R11D,X86::R12D,
10000                                        X86::R13D,X86::R14D,X86::R15D,
10001                                        X86::EBP, X86::ESP, 0);
10002         else if (VT == MVT::i16)
10003           return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
10004                                        X86::SI,  X86::DI,  X86::R8W,X86::R9W,
10005                                        X86::R10W,X86::R11W,X86::R12W,
10006                                        X86::R13W,X86::R14W,X86::R15W,
10007                                        X86::BP,  X86::SP, 0);
10008         else if (VT == MVT::i8)
10009           return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
10010                                        X86::SIL, X86::DIL, X86::R8B,X86::R9B,
10011                                        X86::R10B,X86::R11B,X86::R12B,
10012                                        X86::R13B,X86::R14B,X86::R15B,
10013                                        X86::BPL, X86::SPL, 0);
10014
10015         else if (VT == MVT::i64)
10016           return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
10017                                        X86::RSI, X86::RDI, X86::R8,  X86::R9,
10018                                        X86::R10, X86::R11, X86::R12,
10019                                        X86::R13, X86::R14, X86::R15,
10020                                        X86::RBP, X86::RSP, 0);
10021
10022         break;
10023       }
10024       // 32-bit fallthrough
10025     case 'Q':   // Q_REGS
10026       if (VT == MVT::i32)
10027         return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
10028       else if (VT == MVT::i16)
10029         return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
10030       else if (VT == MVT::i8)
10031         return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
10032       else if (VT == MVT::i64)
10033         return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
10034       break;
10035     }
10036   }
10037
10038   return std::vector<unsigned>();
10039 }
10040
10041 std::pair<unsigned, const TargetRegisterClass*>
10042 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
10043                                                 EVT VT) const {
10044   // First, see if this is a constraint that directly corresponds to an LLVM
10045   // register class.
10046   if (Constraint.size() == 1) {
10047     // GCC Constraint Letters
10048     switch (Constraint[0]) {
10049     default: break;
10050     case 'r':   // GENERAL_REGS
10051     case 'l':   // INDEX_REGS
10052       if (VT == MVT::i8)
10053         return std::make_pair(0U, X86::GR8RegisterClass);
10054       if (VT == MVT::i16)
10055         return std::make_pair(0U, X86::GR16RegisterClass);
10056       if (VT == MVT::i32 || !Subtarget->is64Bit())
10057         return std::make_pair(0U, X86::GR32RegisterClass);
10058       return std::make_pair(0U, X86::GR64RegisterClass);
10059     case 'R':   // LEGACY_REGS
10060       if (VT == MVT::i8)
10061         return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
10062       if (VT == MVT::i16)
10063         return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
10064       if (VT == MVT::i32 || !Subtarget->is64Bit())
10065         return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
10066       return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
10067     case 'f':  // FP Stack registers.
10068       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
10069       // value to the correct fpstack register class.
10070       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
10071         return std::make_pair(0U, X86::RFP32RegisterClass);
10072       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
10073         return std::make_pair(0U, X86::RFP64RegisterClass);
10074       return std::make_pair(0U, X86::RFP80RegisterClass);
10075     case 'y':   // MMX_REGS if MMX allowed.
10076       if (!Subtarget->hasMMX()) break;
10077       return std::make_pair(0U, X86::VR64RegisterClass);
10078     case 'Y':   // SSE_REGS if SSE2 allowed
10079       if (!Subtarget->hasSSE2()) break;
10080       // FALL THROUGH.
10081     case 'x':   // SSE_REGS if SSE1 allowed
10082       if (!Subtarget->hasSSE1()) break;
10083
10084       switch (VT.getSimpleVT().SimpleTy) {
10085       default: break;
10086       // Scalar SSE types.
10087       case MVT::f32:
10088       case MVT::i32:
10089         return std::make_pair(0U, X86::FR32RegisterClass);
10090       case MVT::f64:
10091       case MVT::i64:
10092         return std::make_pair(0U, X86::FR64RegisterClass);
10093       // Vector types.
10094       case MVT::v16i8:
10095       case MVT::v8i16:
10096       case MVT::v4i32:
10097       case MVT::v2i64:
10098       case MVT::v4f32:
10099       case MVT::v2f64:
10100         return std::make_pair(0U, X86::VR128RegisterClass);
10101       }
10102       break;
10103     }
10104   }
10105
10106   // Use the default implementation in TargetLowering to convert the register
10107   // constraint into a member of a register class.
10108   std::pair<unsigned, const TargetRegisterClass*> Res;
10109   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
10110
10111   // Not found as a standard register?
10112   if (Res.second == 0) {
10113     // Map st(0) -> st(7) -> ST0
10114     if (Constraint.size() == 7 && Constraint[0] == '{' &&
10115         tolower(Constraint[1]) == 's' &&
10116         tolower(Constraint[2]) == 't' &&
10117         Constraint[3] == '(' &&
10118         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
10119         Constraint[5] == ')' &&
10120         Constraint[6] == '}') {
10121
10122       Res.first = X86::ST0+Constraint[4]-'0';
10123       Res.second = X86::RFP80RegisterClass;
10124       return Res;
10125     }
10126
10127     // GCC allows "st(0)" to be called just plain "st".
10128     if (StringRef("{st}").equals_lower(Constraint)) {
10129       Res.first = X86::ST0;
10130       Res.second = X86::RFP80RegisterClass;
10131       return Res;
10132     }
10133
10134     // flags -> EFLAGS
10135     if (StringRef("{flags}").equals_lower(Constraint)) {
10136       Res.first = X86::EFLAGS;
10137       Res.second = X86::CCRRegisterClass;
10138       return Res;
10139     }
10140
10141     // 'A' means EAX + EDX.
10142     if (Constraint == "A") {
10143       Res.first = X86::EAX;
10144       Res.second = X86::GR32_ADRegisterClass;
10145       return Res;
10146     }
10147     return Res;
10148   }
10149
10150   // Otherwise, check to see if this is a register class of the wrong value
10151   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
10152   // turn into {ax},{dx}.
10153   if (Res.second->hasType(VT))
10154     return Res;   // Correct type already, nothing to do.
10155
10156   // All of the single-register GCC register classes map their values onto
10157   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
10158   // really want an 8-bit or 32-bit register, map to the appropriate register
10159   // class and return the appropriate register.
10160   if (Res.second == X86::GR16RegisterClass) {
10161     if (VT == MVT::i8) {
10162       unsigned DestReg = 0;
10163       switch (Res.first) {
10164       default: break;
10165       case X86::AX: DestReg = X86::AL; break;
10166       case X86::DX: DestReg = X86::DL; break;
10167       case X86::CX: DestReg = X86::CL; break;
10168       case X86::BX: DestReg = X86::BL; break;
10169       }
10170       if (DestReg) {
10171         Res.first = DestReg;
10172         Res.second = X86::GR8RegisterClass;
10173       }
10174     } else if (VT == MVT::i32) {
10175       unsigned DestReg = 0;
10176       switch (Res.first) {
10177       default: break;
10178       case X86::AX: DestReg = X86::EAX; break;
10179       case X86::DX: DestReg = X86::EDX; break;
10180       case X86::CX: DestReg = X86::ECX; break;
10181       case X86::BX: DestReg = X86::EBX; break;
10182       case X86::SI: DestReg = X86::ESI; break;
10183       case X86::DI: DestReg = X86::EDI; break;
10184       case X86::BP: DestReg = X86::EBP; break;
10185       case X86::SP: DestReg = X86::ESP; break;
10186       }
10187       if (DestReg) {
10188         Res.first = DestReg;
10189         Res.second = X86::GR32RegisterClass;
10190       }
10191     } else if (VT == MVT::i64) {
10192       unsigned DestReg = 0;
10193       switch (Res.first) {
10194       default: break;
10195       case X86::AX: DestReg = X86::RAX; break;
10196       case X86::DX: DestReg = X86::RDX; break;
10197       case X86::CX: DestReg = X86::RCX; break;
10198       case X86::BX: DestReg = X86::RBX; break;
10199       case X86::SI: DestReg = X86::RSI; break;
10200       case X86::DI: DestReg = X86::RDI; break;
10201       case X86::BP: DestReg = X86::RBP; break;
10202       case X86::SP: DestReg = X86::RSP; break;
10203       }
10204       if (DestReg) {
10205         Res.first = DestReg;
10206         Res.second = X86::GR64RegisterClass;
10207       }
10208     }
10209   } else if (Res.second == X86::FR32RegisterClass ||
10210              Res.second == X86::FR64RegisterClass ||
10211              Res.second == X86::VR128RegisterClass) {
10212     // Handle references to XMM physical registers that got mapped into the
10213     // wrong class.  This can happen with constraints like {xmm0} where the
10214     // target independent register mapper will just pick the first match it can
10215     // find, ignoring the required type.
10216     if (VT == MVT::f32)
10217       Res.second = X86::FR32RegisterClass;
10218     else if (VT == MVT::f64)
10219       Res.second = X86::FR64RegisterClass;
10220     else if (X86::VR128RegisterClass->hasType(VT))
10221       Res.second = X86::VR128RegisterClass;
10222   }
10223
10224   return Res;
10225 }
10226
10227 //===----------------------------------------------------------------------===//
10228 //                           X86 Widen vector type
10229 //===----------------------------------------------------------------------===//
10230
10231 /// getWidenVectorType: given a vector type, returns the type to widen
10232 /// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
10233 /// If there is no vector type that we want to widen to, returns MVT::Other
10234 /// When and where to widen is target dependent based on the cost of
10235 /// scalarizing vs using the wider vector type.
10236
10237 EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
10238   assert(VT.isVector());
10239   if (isTypeLegal(VT))
10240     return VT;
10241
10242   // TODO: In computeRegisterProperty, we can compute the list of legal vector
10243   //       type based on element type.  This would speed up our search (though
10244   //       it may not be worth it since the size of the list is relatively
10245   //       small).
10246   EVT EltVT = VT.getVectorElementType();
10247   unsigned NElts = VT.getVectorNumElements();
10248
10249   // On X86, it make sense to widen any vector wider than 1
10250   if (NElts <= 1)
10251     return MVT::Other;
10252
10253   for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
10254        nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
10255     EVT SVT = (MVT::SimpleValueType)nVT;
10256
10257     if (isTypeLegal(SVT) &&
10258         SVT.getVectorElementType() == EltVT &&
10259         SVT.getVectorNumElements() > NElts)
10260       return SVT;
10261   }
10262   return MVT::Other;
10263 }