lib/Target/CellSPU/SPUISelLowering.cpp

   1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
   2 //                     The LLVM Compiler Infrastructure
   3 //
   4 // This file is distributed under the University of Illinois Open Source
   5 // License. See LICENSE.TXT for details.
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements the SPUTargetLowering class.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "SPURegisterNames.h"
  14 #include "SPUISelLowering.h"
  15 #include "SPUTargetMachine.h"
  16 #include "SPUFrameLowering.h"
  17 #include "SPUMachineFunction.h"
  18 #include "llvm/Constants.h"
  19 #include "llvm/Function.h"
  20 #include "llvm/Intrinsics.h"
  21 #include "llvm/CallingConv.h"
  22 #include "llvm/Type.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineFunction.h"
  26 #include "llvm/CodeGen/MachineInstrBuilder.h"
  27 #include "llvm/CodeGen/MachineRegisterInfo.h"
  28 #include "llvm/CodeGen/SelectionDAG.h"
  29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  30 #include "llvm/Target/TargetOptions.h"
  31 #include "llvm/ADT/VectorExtras.h"
  32 #include "llvm/Support/Debug.h"
  33 #include "llvm/Support/ErrorHandling.h"
  34 #include "llvm/Support/MathExtras.h"
  35 #include "llvm/Support/raw_ostream.h"
  36 #include <map>
  37
  38 using namespace llvm;
  39
  40 // Used in getTargetNodeName() below
  41 namespace {
  42   std::map<unsigned, const char *> node_names;
  43
  44   // Byte offset of the preferred slot (counted from the MSB)
  45   int prefslotOffset(EVT VT) {
  46     int retval=0;
  47     if (VT==MVT::i1) retval=3;
  48     if (VT==MVT::i8) retval=3;
  49     if (VT==MVT::i16) retval=2;
  50
  51     return retval;
  52   }
  53
  54   //! Expand a library call into an actual call DAG node
  55   /*!
  56    \note
  57    This code is taken from SelectionDAGLegalize, since it is not exposed as
  58    part of the LLVM SelectionDAG API.
  59    */
  60
  61   SDValue
  62   ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
  63                 bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
  64     // The input chain to this libcall is the entry node of the function.
  65     // Legalizing the call will automatically add the previous call to the
  66     // dependence.
  67     SDValue InChain = DAG.getEntryNode();
  68
  69     TargetLowering::ArgListTy Args;
  70     TargetLowering::ArgListEntry Entry;
  71     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
  72       EVT ArgVT = Op.getOperand(i).getValueType();
  73       const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  74       Entry.Node = Op.getOperand(i);
  75       Entry.Ty = ArgTy;
  76       Entry.isSExt = isSigned;
  77       Entry.isZExt = !isSigned;
  78       Args.push_back(Entry);
  79     }
  80     SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
  81                                            TLI.getPointerTy());
  82
  83     // Splice the libcall in wherever FindInputOutputChains tells us to.
  84     const Type *RetTy =
  85                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
  86     std::pair<SDValue, SDValue> CallInfo =
  87             TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
  88                             0, TLI.getLibcallCallingConv(LC), false,
  89                             /*isReturnValueUsed=*/true,
  90                             Callee, Args, DAG, Op.getDebugLoc());
  91
  92     return CallInfo.first;
  93   }
  94 }
  95
  96 SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  97   : TargetLowering(TM, new TargetLoweringObjectFileELF()),
  98     SPUTM(TM) {
  99
 100   // Use _setjmp/_longjmp instead of setjmp/longjmp.
 101   setUseUnderscoreSetJmp(true);
 102   setUseUnderscoreLongJmp(true);
 103
 104   // Set RTLIB libcall names as used by SPU:
 105   setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
 106
 107   // Set up the SPU's register classes:
 108   addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
 109   addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
 110   addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
 111   addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
 112   addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
 113   addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
 114   addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
 115
 116   // SPU has no sign or zero extended loads for i1, i8, i16:
 117   setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
 118   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 119   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
 120
 121   setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
 122   setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
 123
 124   setTruncStoreAction(MVT::i128, MVT::i64, Expand);
 125   setTruncStoreAction(MVT::i128, MVT::i32, Expand);
 126   setTruncStoreAction(MVT::i128, MVT::i16, Expand);
 127   setTruncStoreAction(MVT::i128, MVT::i8, Expand);
 128
 129   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 130
 131   // SPU constant load actions are custom lowered:
 132   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 133   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 134
 135   // SPU's loads and stores have to be custom lowered:
 136   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
 137        ++sctype) {
 138     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
 139
 140     setOperationAction(ISD::LOAD,   VT, Custom);
 141     setOperationAction(ISD::STORE,  VT, Custom);
 142     setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
 143     setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
 144     setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
 145
 146     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
 147       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
 148       setTruncStoreAction(VT, StoreVT, Expand);
 149     }
 150   }
 151
 152   for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
 153        ++sctype) {
 154     MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
 155
 156     setOperationAction(ISD::LOAD,   VT, Custom);
 157     setOperationAction(ISD::STORE,  VT, Custom);
 158
 159     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
 160       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
 161       setTruncStoreAction(VT, StoreVT, Expand);
 162     }
 163   }
 164
 165   // Expand the jumptable branches
 166   setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
 167   setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
 168
 169   // Custom lower SELECT_CC for most cases, but expand by default
 170   setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
 171   setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
 172   setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
 173   setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
 174   setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
 175
 176   // SPU has no intrinsics for these particular operations:
 177   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 178
 179   // SPU has no division/remainder instructions
 180   setOperationAction(ISD::SREM,    MVT::i8,   Expand);
 181   setOperationAction(ISD::UREM,    MVT::i8,   Expand);
 182   setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
 183   setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
 184   setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
 185   setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
 186   setOperationAction(ISD::SREM,    MVT::i16,  Expand);
 187   setOperationAction(ISD::UREM,    MVT::i16,  Expand);
 188   setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
 189   setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
 190   setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
 191   setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
 192   setOperationAction(ISD::SREM,    MVT::i32,  Expand);
 193   setOperationAction(ISD::UREM,    MVT::i32,  Expand);
 194   setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
 195   setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
 196   setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
 197   setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
 198   setOperationAction(ISD::SREM,    MVT::i64,  Expand);
 199   setOperationAction(ISD::UREM,    MVT::i64,  Expand);
 200   setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
 201   setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
 202   setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
 203   setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
 204   setOperationAction(ISD::SREM,    MVT::i128, Expand);
 205   setOperationAction(ISD::UREM,    MVT::i128, Expand);
 206   setOperationAction(ISD::SDIV,    MVT::i128, Expand);
 207   setOperationAction(ISD::UDIV,    MVT::i128, Expand);
 208   setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
 209   setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
 210
 211   // We don't support sin/cos/sqrt/fmod
 212   setOperationAction(ISD::FSIN , MVT::f64, Expand);
 213   setOperationAction(ISD::FCOS , MVT::f64, Expand);
 214   setOperationAction(ISD::FREM , MVT::f64, Expand);
 215   setOperationAction(ISD::FSIN , MVT::f32, Expand);
 216   setOperationAction(ISD::FCOS , MVT::f32, Expand);
 217   setOperationAction(ISD::FREM , MVT::f32, Expand);
 218
 219   // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
 220   // for f32!)
 221   setOperationAction(ISD::FSQRT, MVT::f64, Expand);
 222   setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 223
 224   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 225   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 226
 227   // SPU can do rotate right and left, so legalize it... but customize for i8
 228   // because instructions don't exist.
 229
 230   // FIXME: Change from "expand" to appropriate type once ROTR is supported in
 231   //        .td files.
 232   setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
 233   setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
 234   setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
 235
 236   setOperationAction(ISD::ROTL, MVT::i32,    Legal);
 237   setOperationAction(ISD::ROTL, MVT::i16,    Legal);
 238   setOperationAction(ISD::ROTL, MVT::i8,     Custom);
 239
 240   // SPU has no native version of shift left/right for i8
 241   setOperationAction(ISD::SHL,  MVT::i8,     Custom);
 242   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
 243   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
 244
 245   // Make these operations legal and handle them during instruction selection:
 246   setOperationAction(ISD::SHL,  MVT::i64,    Legal);
 247   setOperationAction(ISD::SRL,  MVT::i64,    Legal);
 248   setOperationAction(ISD::SRA,  MVT::i64,    Legal);
 249
 250   // Custom lower i8, i32 and i64 multiplications
 251   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
 252   setOperationAction(ISD::MUL,  MVT::i32,    Legal);
 253   setOperationAction(ISD::MUL,  MVT::i64,    Legal);
 254
 255   // Expand double-width multiplication
 256   // FIXME: It would probably be reasonable to support some of these operations
 257   setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
 258   setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
 259   setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
 260   setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
 261   setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
 262   setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
 263   setOperationAction(ISD::MULHU,     MVT::i16, Expand);
 264   setOperationAction(ISD::MULHS,     MVT::i16, Expand);
 265   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 266   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 267   setOperationAction(ISD::MULHU,     MVT::i32, Expand);
 268   setOperationAction(ISD::MULHS,     MVT::i32, Expand);
 269   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 270   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 271   setOperationAction(ISD::MULHU,     MVT::i64, Expand);
 272   setOperationAction(ISD::MULHS,     MVT::i64, Expand);
 273
 274   // Need to custom handle (some) common i8, i64 math ops
 275   setOperationAction(ISD::ADD,  MVT::i8,     Custom);
 276   setOperationAction(ISD::ADD,  MVT::i64,    Legal);
 277   setOperationAction(ISD::SUB,  MVT::i8,     Custom);
 278   setOperationAction(ISD::SUB,  MVT::i64,    Legal);
 279
 280   // SPU does not have BSWAP. It does have i32 support CTLZ.
 281   // CTPOP has to be custom lowered.
 282   setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
 283   setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
 284
 285   setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
 286   setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
 287   setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
 288   setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
 289   setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
 290
 291   setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
 292   setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
 293   setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
 294   setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
 295   setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
 296
 297   setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
 298   setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
 299   setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
 300   setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
 301   setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
 302
 303   // SPU has a version of select that implements (a&~c)|(b&c), just like
 304   // select ought to work:
 305   setOperationAction(ISD::SELECT, MVT::i8,   Legal);
 306   setOperationAction(ISD::SELECT, MVT::i16,  Legal);
 307   setOperationAction(ISD::SELECT, MVT::i32,  Legal);
 308   setOperationAction(ISD::SELECT, MVT::i64,  Legal);
 309
 310   setOperationAction(ISD::SETCC, MVT::i8,    Legal);
 311   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
 312   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
 313   setOperationAction(ISD::SETCC, MVT::i64,   Legal);
 314   setOperationAction(ISD::SETCC, MVT::f64,   Custom);
 315
 316   // Custom lower i128 -> i64 truncates
 317   setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
 318
 319   // Custom lower i32/i64 -> i128 sign extend
 320   setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
 321
 322   setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
 323   setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
 324   setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
 325   setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
 326   // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
 327   // to expand to a libcall, hence the custom lowering:
 328   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 329   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 330   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
 331   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
 332   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
 333   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
 334
 335   // FDIV on SPU requires custom lowering
 336   setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
 337
 338   // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
 339   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 340   setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
 341   setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
 342   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 343   setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 344   setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
 345   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 346   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 347
 348   setOperationAction(ISD::BITCAST, MVT::i32, Legal);
 349   setOperationAction(ISD::BITCAST, MVT::f32, Legal);
 350   setOperationAction(ISD::BITCAST, MVT::i64, Legal);
 351   setOperationAction(ISD::BITCAST, MVT::f64, Legal);
 352
 353   // We cannot sextinreg(i1).  Expand to shifts.
 354   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 355
 356   // We want to legalize GlobalAddress and ConstantPool nodes into the
 357   // appropriate instructions to materialize the address.
 358   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
 359        ++sctype) {
 360     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
 361
 362     setOperationAction(ISD::GlobalAddress,  VT, Custom);
 363     setOperationAction(ISD::ConstantPool,   VT, Custom);
 364     setOperationAction(ISD::JumpTable,      VT, Custom);
 365   }
 366
 367   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 368   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 369
 370   // Use the default implementation.
 371   setOperationAction(ISD::VAARG             , MVT::Other, Expand);
 372   setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
 373   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 374   setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
 375   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
 376   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
 377   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
 378
 379   // Cell SPU has instructions for converting between i64 and fp.
 380   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 381   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 382
 383   // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
 384   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
 385
 386   // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
 387   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
 388
 389   // First set operation action for all vector types to expand. Then we
 390   // will selectively turn on ones that can be effectively codegen'd.
 391   addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
 392   addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
 393   addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
 394   addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
 395   addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
 396   addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
 397
 398   for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 399        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
 400     MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
 401
 402     // add/sub are legal for all supported vector VT's.
 403     setOperationAction(ISD::ADD,     VT, Legal);
 404     setOperationAction(ISD::SUB,     VT, Legal);
 405     // mul has to be custom lowered.
 406     setOperationAction(ISD::MUL,     VT, Legal);
 407
 408     setOperationAction(ISD::AND,     VT, Legal);
 409     setOperationAction(ISD::OR,      VT, Legal);
 410     setOperationAction(ISD::XOR,     VT, Legal);
 411     setOperationAction(ISD::LOAD,    VT, Custom);
 412     setOperationAction(ISD::SELECT,  VT, Legal);
 413     setOperationAction(ISD::STORE,   VT, Custom);
 414
 415     // These operations need to be expanded:
 416     setOperationAction(ISD::SDIV,    VT, Expand);
 417     setOperationAction(ISD::SREM,    VT, Expand);
 418     setOperationAction(ISD::UDIV,    VT, Expand);
 419     setOperationAction(ISD::UREM,    VT, Expand);
 420
 421     // Custom lower build_vector, constant pool spills, insert and
 422     // extract vector elements:
 423     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 424     setOperationAction(ISD::ConstantPool, VT, Custom);
 425     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
 426     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 427     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 428     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 429   }
 430
 431   setOperationAction(ISD::AND, MVT::v16i8, Custom);
 432   setOperationAction(ISD::OR,  MVT::v16i8, Custom);
 433   setOperationAction(ISD::XOR, MVT::v16i8, Custom);
 434   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
 435
 436   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
 437
 438   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 439
 440   setStackPointerRegisterToSaveRestore(SPU::R1);
 441
 442   // We have target-specific dag combine patterns for the following nodes:
 443   setTargetDAGCombine(ISD::ADD);
 444   setTargetDAGCombine(ISD::ZERO_EXTEND);
 445   setTargetDAGCombine(ISD::SIGN_EXTEND);
 446   setTargetDAGCombine(ISD::ANY_EXTEND);
 447
 448   setMinFunctionAlignment(3);
 449
 450   computeRegisterProperties();
 451
 452   // Set pre-RA register scheduler default to BURR, which produces slightly
 453   // better code than the default (could also be TDRR, but TargetLowering.h
 454   // needs a mod to support that model):
 455   setSchedulingPreference(Sched::RegPressure);
 456 }
 457
 458 const char *
 459 SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
 460 {
 461   if (node_names.empty()) {
 462     node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
 463     node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
 464     node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
 465     node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
 466     node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
 467     node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
 468     node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
 469     node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
 470     node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
 471     node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
 472     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
 473     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
 474     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
 475     node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
 476     node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
 477     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
 478     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
 479     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
 480     node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
 481             "SPUISD::ROTBYTES_LEFT_BITS";
 482     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
 483     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
 484     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
 485     node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
 486     node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
 487   }
 488
 489   std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
 490
 491   return ((i != node_names.end()) ? i->second : 0);
 492 }
 493
 494 //===----------------------------------------------------------------------===//
 495 // Return the Cell SPU's SETCC result type
 496 //===----------------------------------------------------------------------===//
 497
 498 MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
 499   // i8, i16 and i32 are valid SETCC result types
 500   MVT::SimpleValueType retval;
 501
 502   switch(VT.getSimpleVT().SimpleTy){
 503     case MVT::i1:
 504     case MVT::i8:
 505       retval = MVT::i8; break;
 506     case MVT::i16:
 507       retval = MVT::i16; break;
 508     case MVT::i32:
 509     default:
 510       retval = MVT::i32;
 511   }
 512   return retval;
 513 }
 514
 515 //===----------------------------------------------------------------------===//
 516 // Calling convention code:
 517 //===----------------------------------------------------------------------===//
 518
 519 #include "SPUGenCallingConv.inc"
 520
 521 //===----------------------------------------------------------------------===//
 522 //  LowerOperation implementation
 523 //===----------------------------------------------------------------------===//
 524
 525 /// Custom lower loads for CellSPU
 526 /*!
 527  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 528  within a 16-byte block, we have to rotate to extract the requested element.
 529
 530  For extending loads, we also want to ensure that the following sequence is
 531  emitted, e.g. for MVT::f32 extending load to MVT::f64:
 532
 533 \verbatim
 534 %1  v16i8,ch = load
 535 %2  v16i8,ch = rotate %1
  536 %3  v4f32,ch = bitconvert %2
  537 %4  f32      = vec2prefslot %3
 538 %5  f64      = fp_extend %4
 539 \endverbatim
 540 */
 541 static SDValue
 542 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 543   LoadSDNode *LN = cast<LoadSDNode>(Op);
 544   SDValue the_chain = LN->getChain();
 545   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 546   EVT InVT = LN->getMemoryVT();
 547   EVT OutVT = Op.getValueType();
 548   ISD::LoadExtType ExtType = LN->getExtensionType();
 549   unsigned alignment = LN->getAlignment();
 550   int pso = prefslotOffset(InVT);
 551   DebugLoc dl = Op.getDebugLoc();
 552   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
 553                                                   (128 / InVT.getSizeInBits()));
 554
 555   // two sanity checks
 556   assert( LN->getAddressingMode() == ISD::UNINDEXED
 557           && "we should get only UNINDEXED adresses");
 558   // clean aligned loads can be selected as-is
 559   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
 560     return SDValue();
 561
 562   // Get pointerinfos to the memory chunk(s) that contain the data to load
 563   uint64_t mpi_offset = LN->getPointerInfo().Offset;
 564   mpi_offset -= mpi_offset%16;
 565   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
 566   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
 567
 568   SDValue result;
 569   SDValue basePtr = LN->getBasePtr();
 570   SDValue rotate;
 571
 572   if ((alignment%16) == 0) {
 573     ConstantSDNode *CN;
 574
 575     // Special cases for a known aligned load to simplify the base pointer
 576     // and the rotation amount:
 577     if (basePtr.getOpcode() == ISD::ADD
 578         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
 579       // Known offset into basePtr
 580       int64_t offset = CN->getSExtValue();
 581       int64_t rotamt = int64_t((offset & 0xf) - pso);
 582
 583       if (rotamt < 0)
 584         rotamt += 16;
 585
 586       rotate = DAG.getConstant(rotamt, MVT::i16);
 587
 588       // Simplify the base pointer for this case:
 589       basePtr = basePtr.getOperand(0);
 590       if ((offset & ~0xf) > 0) {
 591         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 592                               basePtr,
 593                               DAG.getConstant((offset & ~0xf), PtrVT));
 594       }
 595     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
 596                || (basePtr.getOpcode() == SPUISD::IndirectAddr
 597                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
 598                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
 599       // Plain aligned a-form address: rotate into preferred slot
 600       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
 601       int64_t rotamt = -pso;
 602       if (rotamt < 0)
 603         rotamt += 16;
 604       rotate = DAG.getConstant(rotamt, MVT::i16);
 605     } else {
 606       // Offset the rotate amount by the basePtr and the preferred slot
 607       // byte offset
 608       int64_t rotamt = -pso;
 609       if (rotamt < 0)
 610         rotamt += 16;
 611       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
 612                            basePtr,
 613                            DAG.getConstant(rotamt, PtrVT));
 614     }
 615   } else {
 616     // Unaligned load: must be more pessimistic about addressing modes:
 617     if (basePtr.getOpcode() == ISD::ADD) {
 618       MachineFunction &MF = DAG.getMachineFunction();
 619       MachineRegisterInfo &RegInfo = MF.getRegInfo();
 620       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 621       SDValue Flag;
 622
 623       SDValue Op0 = basePtr.getOperand(0);
 624       SDValue Op1 = basePtr.getOperand(1);
 625
 626       if (isa<ConstantSDNode>(Op1)) {
 627         // Convert the (add <ptr>, <const>) to an indirect address contained
 628         // in a register. Note that this is done because we need to avoid
 629         // creating a 0(reg) d-form address due to the SPU's block loads.
 630         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 631         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
 632         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
 633       } else {
 634         // Convert the (add <arg1>, <arg2>) to an indirect address, which
 635         // will likely be lowered as a reg(reg) x-form address.
 636         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 637       }
 638     } else {
 639       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 640                             basePtr,
 641                             DAG.getConstant(0, PtrVT));
 642    }
 643
 644     // Offset the rotate amount by the basePtr and the preferred slot
 645     // byte offset
 646     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
 647                          basePtr,
 648                          DAG.getConstant(-pso, PtrVT));
 649   }
 650
 651   // Do the load as a i128 to allow possible shifting
 652   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
 653                        lowMemPtr,
 654                        LN->isVolatile(), LN->isNonTemporal(), 16);
 655
 656   // When the size is not greater than alignment we get all data with just
 657   // one load
 658   if (alignment >= InVT.getSizeInBits()/8) {
 659     // Update the chain
 660     the_chain = low.getValue(1);
 661
 662     // Rotate into the preferred slot:
 663     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
 664                          low.getValue(0), rotate);
 665
 666     // Convert the loaded v16i8 vector to the appropriate vector type
 667     // specified by the operand:
 668     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
 669                                  InVT, (128 / InVT.getSizeInBits()));
 670     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
 671                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
 672   }
 673   // When alignment is less than the size, we might need (known only at
 674   // run-time) two loads
 675   // TODO: if the memory address is composed only from constants, we have
 676   // extra kowledge, and might avoid the second load
 677   else {
 678     // storage position offset from lower 16 byte aligned memory chunk
 679     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
 680                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
 681     // get a registerfull of ones. (this implementation is a workaround: LLVM
 682     // cannot handle 128 bit signed int constants)
 683     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
 684     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
 685
 686     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
 687                                DAG.getNode(ISD::ADD, dl, PtrVT,
 688                                            basePtr,
 689                                            DAG.getConstant(16, PtrVT)),
 690                                highMemPtr,
 691                                LN->isVolatile(), LN->isNonTemporal(), 16);
 692
 693     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
 694                                                               high.getValue(1));
 695
 696     // Shift the (possible) high part right to compensate the misalignemnt.
 697     // if there is no highpart (i.e. value is i64 and offset is 4), this
 698     // will zero out the high value.
 699     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
 700                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
 701                                                  DAG.getConstant( 16, MVT::i32),
 702                                                  offset
 703                                                 ));
 704
 705     // Shift the low similarly
 706     // TODO: add SPUISD::SHL_BYTES
 707     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
 708
 709     // Merge the two parts
 710     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
 711                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
 712
 713     if (!InVT.isVector()) {
 714       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
 715      }
 716
 717   }
 718     // Handle extending loads by extending the scalar result:
 719     if (ExtType == ISD::SEXTLOAD) {
 720       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
 721     } else if (ExtType == ISD::ZEXTLOAD) {
 722       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
 723     } else if (ExtType == ISD::EXTLOAD) {
 724       unsigned NewOpc = ISD::ANY_EXTEND;
 725
 726       if (OutVT.isFloatingPoint())
 727         NewOpc = ISD::FP_EXTEND;
 728
 729       result = DAG.getNode(NewOpc, dl, OutVT, result);
 730     }
 731
 732     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
 733     SDValue retops[2] = {
 734       result,
 735       the_chain
 736     };
 737
 738     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
 739                          retops, sizeof(retops) / sizeof(retops[0]));
 740     return result;
 741 }
 742
 743 /// Custom lower stores for CellSPU
 744 /*!
 745  All CellSPU stores are aligned to 16-byte boundaries, so for elements
 746  within a 16-byte block, we have to generate a shuffle to insert the
 747  requested element into its place, then store the resulting block.
 748  */
 749 static SDValue
 750 LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 751   StoreSDNode *SN = cast<StoreSDNode>(Op);
 752   SDValue Value = SN->getValue();
 753   EVT VT = Value.getValueType();
 754   EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
 755   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 756   DebugLoc dl = Op.getDebugLoc();
 757   unsigned alignment = SN->getAlignment();
 758   SDValue result;
 759   EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
 760                                                  (128 / StVT.getSizeInBits()));
 761   // Get pointerinfos to the memory chunk(s) that contain the data to load
 762   uint64_t mpi_offset = SN->getPointerInfo().Offset;
 763   mpi_offset -= mpi_offset%16;
 764   MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
 765   MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
 766
 767
 768   // two sanity checks
 769   assert( SN->getAddressingMode() == ISD::UNINDEXED
 770           && "we should get only UNINDEXED adresses");
 771   // clean aligned loads can be selected as-is
 772   if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
 773     return SDValue();
 774
 775   SDValue alignLoadVec;
 776   SDValue basePtr = SN->getBasePtr();
 777   SDValue the_chain = SN->getChain();
 778   SDValue insertEltOffs;
 779
 780   if ((alignment%16) == 0) {
 781     ConstantSDNode *CN;
 782     // Special cases for a known aligned load to simplify the base pointer
 783     // and insertion byte:
 784     if (basePtr.getOpcode() == ISD::ADD
 785         && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
 786       // Known offset into basePtr
 787       int64_t offset = CN->getSExtValue();
 788
 789       // Simplify the base pointer for this case:
 790       basePtr = basePtr.getOperand(0);
 791       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 792                                   basePtr,
 793                                   DAG.getConstant((offset & 0xf), PtrVT));
 794
 795       if ((offset & ~0xf) > 0) {
 796         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 797                               basePtr,
 798                               DAG.getConstant((offset & ~0xf), PtrVT));
 799       }
 800     } else {
 801       // Otherwise, assume it's at byte 0 of basePtr
 802       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 803                                   basePtr,
 804                                   DAG.getConstant(0, PtrVT));
 805       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 806                                   basePtr,
 807                                   DAG.getConstant(0, PtrVT));
 808     }
 809   } else {
 810     // Unaligned load: must be more pessimistic about addressing modes:
 811     if (basePtr.getOpcode() == ISD::ADD) {
 812       MachineFunction &MF = DAG.getMachineFunction();
 813       MachineRegisterInfo &RegInfo = MF.getRegInfo();
 814       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 815       SDValue Flag;
 816
 817       SDValue Op0 = basePtr.getOperand(0);
 818       SDValue Op1 = basePtr.getOperand(1);
 819
 820       if (isa<ConstantSDNode>(Op1)) {
 821         // Convert the (add <ptr>, <const>) to an indirect address contained
 822         // in a register. Note that this is done because we need to avoid
 823         // creating a 0(reg) d-form address due to the SPU's block loads.
 824         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 825         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
 826         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
 827       } else {
 828         // Convert the (add <arg1>, <arg2>) to an indirect address, which
 829         // will likely be lowered as a reg(reg) x-form address.
 830         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 831       }
 832     } else {
 833       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 834                             basePtr,
 835                             DAG.getConstant(0, PtrVT));
 836     }
 837
 838     // Insertion point is solely determined by basePtr's contents
 839     insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
 840                                 basePtr,
 841                                 DAG.getConstant(0, PtrVT));
 842   }
 843
 844   // Load the lower part of the memory to which to store.
 845   SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
 846                           lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
 847
 848   // if we don't need to store over the 16 byte boundary, one store suffices
 849   if (alignment >= StVT.getSizeInBits()/8) {
 850     // Update the chain
 851     the_chain = low.getValue(1);
 852
 853     LoadSDNode *LN = cast<LoadSDNode>(low);
 854     SDValue theValue = SN->getValue();
 855
 856     if (StVT != VT
 857         && (theValue.getOpcode() == ISD::AssertZext
 858             || theValue.getOpcode() == ISD::AssertSext)) {
 859       // Drill down and get the value for zero- and sign-extended
 860       // quantities
 861       theValue = theValue.getOperand(0);
 862     }
 863
 864     // If the base pointer is already a D-form address, then just create
 865     // a new D-form address with a slot offset and the orignal base pointer.
 866     // Otherwise generate a D-form address with the slot offset relative
 867     // to the stack pointer, which is always aligned.
 868 #if !defined(NDEBUG)
 869       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
 870         errs() << "CellSPU LowerSTORE: basePtr = ";
 871         basePtr.getNode()->dump(&DAG);
 872         errs() << "\n";
 873       }
 874 #endif
 875
 876     SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
 877                                       insertEltOffs);
 878     SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
 879                                       theValue);
 880
 881     result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
 882                          vectorizeOp, low,
 883                          DAG.getNode(ISD::BITCAST, dl,
 884                                      MVT::v4i32, insertEltOp));
 885
 886     result = DAG.getStore(the_chain, dl, result, basePtr,
 887                           lowMemPtr,
 888                           LN->isVolatile(), LN->isNonTemporal(),
 889                           16);
 890
 891   }
 892   // do the store when it might cross the 16 byte memory access boundary.
 893   else {
 894     // TODO issue a warning if SN->isVolatile()== true? This is likely not
 895     // what the user wanted.
 896
 897     // address offset from nearest lower 16byte alinged address
 898     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
 899                                     SN->getBasePtr(),
 900                                     DAG.getConstant(0xf, MVT::i32));
 901     // 16 - offset
 902     SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
 903                                            DAG.getConstant( 16, MVT::i32),
 904                                            offset);
 905     // 16 - sizeof(Value)
 906     SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
 907                                      DAG.getConstant( 16, MVT::i32),
 908                                      DAG.getConstant( VT.getSizeInBits()/8,
 909                                                       MVT::i32));
 910     // get a registerfull of ones
 911     SDValue ones = DAG.getConstant(-1, MVT::v4i32);
 912     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
 913
 914     // Create the 128 bit masks that have ones where the data to store is
 915     // located.
 916     SDValue lowmask, himask;
 917     // if the value to store don't fill up the an entire 128 bits, zero
 918     // out the last bits of the mask so that only the value we want to store
 919     // is masked.
 920     // this is e.g. in the case of store i32, align 2
 921     if (!VT.isVector()){
 922       Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
 923       lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
 924       lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
 925                                                                surplus);
 926       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
 927       Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
 928
 929     }
 930     else {
 931       lowmask = ones;
 932       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
 933     }
 934     // this will zero, if there are no data that goes to the high quad
 935     himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
 936                                                             offset_compl);
 937     lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
 938                                                              offset);
 939
 940     // Load in the old data and zero out the parts that will be overwritten with
 941     // the new data to store.
 942     SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
 943                                DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
 944                                            DAG.getConstant( 16, PtrVT)),
 945                                highMemPtr,
 946                                SN->isVolatile(), SN->isNonTemporal(), 16);
 947     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
 948                                                               hi.getValue(1));
 949
 950     low = DAG.getNode(ISD::AND, dl, MVT::i128,
 951                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
 952                         DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
 953     hi = DAG.getNode(ISD::AND, dl, MVT::i128,
 954                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
 955                         DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
 956
 957     // Shift the Value to store into place. rlow contains the parts that go to
 958     // the lower memory chunk, rhi has the parts that go to the upper one.
 959     SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
 960     rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
 961     SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
 962                                                             offset_compl);
 963
 964     // Merge the old data and the new data and store the results
 965     // Need to convert vectors here to integer as 'OR'ing floats assert
 966     rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
 967                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
 968                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
 969     rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
 970                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
 971                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
 972
 973     low = DAG.getStore(the_chain, dl, rlow, basePtr,
 974                           lowMemPtr,
 975                           SN->isVolatile(), SN->isNonTemporal(), 16);
 976     hi  = DAG.getStore(the_chain, dl, rhi,
 977                             DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
 978                                         DAG.getConstant( 16, PtrVT)),
 979                             highMemPtr,
 980                             SN->isVolatile(), SN->isNonTemporal(), 16);
 981     result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
 982                                                            hi.getValue(0));
 983   }
 984
 985   return result;
 986 }
 987
 988 //! Generate the address of a constant pool entry.
 989 static SDValue
 990 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 991   EVT PtrVT = Op.getValueType();
 992   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 993   const Constant *C = CP->getConstVal();
 994   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
 995   SDValue Zero = DAG.getConstant(0, PtrVT);
 996   const TargetMachine &TM = DAG.getTarget();
 997   // FIXME there is no actual debug info here
 998   DebugLoc dl = Op.getDebugLoc();
 999
1000   if (TM.getRelocationModel() == Reloc::Static) {
1001     if (!ST->usingLargeMem()) {
1002       // Just return the SDValue with the constant pool address in it.
1003       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
1004     } else {
1005       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
1006       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
1007       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1008     }
1009   }
1010
1011   llvm_unreachable("LowerConstantPool: Relocation model other than static"
1012                    " not supported.");
1013   return SDValue();
1014 }
1015
1016 //! Alternate entry point for generating the address of a constant pool entry
1017 SDValue
1018 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
1019   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
1020 }
1021
1022 static SDValue
1023 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1024   EVT PtrVT = Op.getValueType();
1025   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
1026   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
1027   SDValue Zero = DAG.getConstant(0, PtrVT);
1028   const TargetMachine &TM = DAG.getTarget();
1029   // FIXME there is no actual debug info here
1030   DebugLoc dl = Op.getDebugLoc();
1031
1032   if (TM.getRelocationModel() == Reloc::Static) {
1033     if (!ST->usingLargeMem()) {
1034       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
1035     } else {
1036       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
1037       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
1038       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1039     }
1040   }
1041
1042   llvm_unreachable("LowerJumpTable: Relocation model other than static"
1043                    " not supported.");
1044   return SDValue();
1045 }
1046
1047 static SDValue
1048 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1049   EVT PtrVT = Op.getValueType();
1050   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
1051   const GlobalValue *GV = GSDN->getGlobal();
1052   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
1053                                           PtrVT, GSDN->getOffset());
1054   const TargetMachine &TM = DAG.getTarget();
1055   SDValue Zero = DAG.getConstant(0, PtrVT);
1056   // FIXME there is no actual debug info here
1057   DebugLoc dl = Op.getDebugLoc();
1058
1059   if (TM.getRelocationModel() == Reloc::Static) {
1060     if (!ST->usingLargeMem()) {
1061       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
1062     } else {
1063       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
1064       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
1065       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1066     }
1067   } else {
1068     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
1069                       "not supported.");
1070     /*NOTREACHED*/
1071   }
1072
1073   return SDValue();
1074 }
1075
1076 //! Custom lower double precision floating point constants
1077 static SDValue
1078 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
1079   EVT VT = Op.getValueType();
1080   // FIXME there is no actual debug info here
1081   DebugLoc dl = Op.getDebugLoc();
1082
1083   if (VT == MVT::f64) {
1084     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
1085
1086     assert((FP != 0) &&
1087            "LowerConstantFP: Node is not ConstantFPSDNode");
1088
1089     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
1090     SDValue T = DAG.getConstant(dbits, MVT::i64);
1091     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
1092     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
1093                        DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
1094   }
1095
1096   return SDValue();
1097 }
1098
1099 SDValue
1100 SPUTargetLowering::LowerFormalArguments(SDValue Chain,
1101                                         CallingConv::ID CallConv, bool isVarArg,
1102                                         const SmallVectorImpl<ISD::InputArg>
1103                                           &Ins,
1104                                         DebugLoc dl, SelectionDAG &DAG,
1105                                         SmallVectorImpl<SDValue> &InVals)
1106                                           const {
1107
1108   MachineFunction &MF = DAG.getMachineFunction();
1109   MachineFrameInfo *MFI = MF.getFrameInfo();
1110   MachineRegisterInfo &RegInfo = MF.getRegInfo();
1111   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
1112
1113   unsigned ArgOffset = SPUFrameLowering::minStackSize();
1114   unsigned ArgRegIdx = 0;
1115   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1116
1117   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1118
1119   SmallVector<CCValAssign, 16> ArgLocs;
1120   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1121                  getTargetMachine(), ArgLocs, *DAG.getContext());
1122   // FIXME: allow for other calling conventions
1123   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
1124
1125   // Add DAG nodes to load the arguments or copy them out of registers.
1126   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
1127     EVT ObjectVT = Ins[ArgNo].VT;
1128     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
1129     SDValue ArgVal;
1130     CCValAssign &VA = ArgLocs[ArgNo];
1131
1132     if (VA.isRegLoc()) {
1133       const TargetRegisterClass *ArgRegClass;
1134
1135       switch (ObjectVT.getSimpleVT().SimpleTy) {
1136       default:
1137         report_fatal_error("LowerFormalArguments Unhandled argument type: " +
1138                            Twine(ObjectVT.getEVTString()));
1139       case MVT::i8:
1140         ArgRegClass = &SPU::R8CRegClass;
1141         break;
1142       case MVT::i16:
1143         ArgRegClass = &SPU::R16CRegClass;
1144         break;
1145       case MVT::i32:
1146         ArgRegClass = &SPU::R32CRegClass;
1147         break;
1148       case MVT::i64:
1149         ArgRegClass = &SPU::R64CRegClass;
1150         break;
1151       case MVT::i128:
1152         ArgRegClass = &SPU::GPRCRegClass;
1153         break;
1154       case MVT::f32:
1155         ArgRegClass = &SPU::R32FPRegClass;
1156         break;
1157       case MVT::f64:
1158         ArgRegClass = &SPU::R64FPRegClass;
1159         break;
1160       case MVT::v2f64:
1161       case MVT::v4f32:
1162       case MVT::v2i64:
1163       case MVT::v4i32:
1164       case MVT::v8i16:
1165       case MVT::v16i8:
1166         ArgRegClass = &SPU::VECREGRegClass;
1167         break;
1168       }
1169
1170       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
1171       RegInfo.addLiveIn(VA.getLocReg(), VReg);
1172       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
1173       ++ArgRegIdx;
1174     } else {
1175       // We need to load the argument to a virtual register if we determined
1176       // above that we ran out of physical registers of the appropriate type
1177       // or we're forced to do vararg
1178       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
1179       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1180       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
1181                            false, false, 0);
1182       ArgOffset += StackSlotSize;
1183     }
1184
1185     InVals.push_back(ArgVal);
1186     // Update the chain
1187     Chain = ArgVal.getOperand(0);
1188   }
1189
1190   // vararg handling:
1191   if (isVarArg) {
1192     // FIXME: we should be able to query the argument registers from
1193     //        tablegen generated code.
1194     static const unsigned ArgRegs[] = {
1195       SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
1196       SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
1197       SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
1198       SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
1199       SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
1200       SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
1201       SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
1202       SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
1203       SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
1204       SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
1205       SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
1206     };
1207     // size of ArgRegs array
1208     unsigned NumArgRegs = 77;
1209
1210     // We will spill (79-3)+1 registers to the stack
1211     SmallVector<SDValue, 79-3+1> MemOps;
1212
1213     // Create the frame slot
1214     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
1215       FuncInfo->setVarArgsFrameIndex(
1216         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
1217       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
1218       unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
1219       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
1220       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
1221                                    false, false, 0);
1222       Chain = Store.getOperand(0);
1223       MemOps.push_back(Store);
1224
1225       // Increment address by stack slot size for the next stored argument
1226       ArgOffset += StackSlotSize;
1227     }
1228     if (!MemOps.empty())
1229       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1230                           &MemOps[0], MemOps.size());
1231   }
1232
1233   return Chain;
1234 }
1235
1236 /// isLSAAddress - Return the immediate to use if the specified
1237 /// value is representable as a LSA address.
1238 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
1239   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
1240   if (!C) return 0;
1241
1242   int Addr = C->getZExtValue();
1243   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
1244       (Addr << 14 >> 14) != Addr)
1245     return 0;  // Top 14 bits have to be sext of immediate.
1246
1247   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
1248 }
1249
1250 SDValue
1251 SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1252                              CallingConv::ID CallConv, bool isVarArg,
1253                              bool &isTailCall,
1254                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1255                              const SmallVectorImpl<SDValue> &OutVals,
1256                              const SmallVectorImpl<ISD::InputArg> &Ins,
1257                              DebugLoc dl, SelectionDAG &DAG,
1258                              SmallVectorImpl<SDValue> &InVals) const {
1259   // CellSPU target does not yet support tail call optimization.
1260   isTailCall = false;
1261
1262   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
1263   unsigned NumOps     = Outs.size();
1264   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1265
1266   SmallVector<CCValAssign, 16> ArgLocs;
1267   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1268                  getTargetMachine(), ArgLocs, *DAG.getContext());
1269   // FIXME: allow for other calling conventions
1270   CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
1271
1272   const unsigned NumArgRegs = ArgLocs.size();
1273
1274
1275   // Handy pointer type
1276   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1277
1278   // Set up a copy of the stack pointer for use loading and storing any
1279   // arguments that may not fit in the registers available for argument
1280   // passing.
1281   SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
1282
1283   // Figure out which arguments are going to go in registers, and which in
1284   // memory.
1285   unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
1286   unsigned ArgRegIdx = 0;
1287
1288   // Keep track of registers passing arguments
1289   std::vector<std::pair<unsigned, SDValue> > RegsToPass;
1290   // And the arguments passed on the stack
1291   SmallVector<SDValue, 8> MemOpChains;
1292
1293   for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
1294     SDValue Arg = OutVals[ArgRegIdx];
1295     CCValAssign &VA = ArgLocs[ArgRegIdx];
1296
1297     // PtrOff will be used to store the current argument to the stack if a
1298     // register cannot be found for it.
1299     SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
1300     PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
1301
1302     switch (Arg.getValueType().getSimpleVT().SimpleTy) {
1303     default: llvm_unreachable("Unexpected ValueType for argument!");
1304     case MVT::i8:
1305     case MVT::i16:
1306     case MVT::i32:
1307     case MVT::i64:
1308     case MVT::i128:
1309     case MVT::f32:
1310     case MVT::f64:
1311     case MVT::v2i64:
1312     case MVT::v2f64:
1313     case MVT::v4f32:
1314     case MVT::v4i32:
1315     case MVT::v8i16:
1316     case MVT::v16i8:
1317       if (ArgRegIdx != NumArgRegs) {
1318         RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1319       } else {
1320         MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
1321                                            MachinePointerInfo(),
1322                                            false, false, 0));
1323         ArgOffset += StackSlotSize;
1324       }
1325       break;
1326     }
1327   }
1328
1329   // Accumulate how many bytes are to be pushed on the stack, including the
1330   // linkage area, and parameter passing area.  According to the SPU ABI,
1331   // we minimally need space for [LR] and [SP].
1332   unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();
1333
1334   // Insert a call sequence start
1335   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
1336                                                             true));
1337
1338   if (!MemOpChains.empty()) {
1339     // Adjust the stack pointer for the stack arguments.
1340     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1341                         &MemOpChains[0], MemOpChains.size());
1342   }
1343
1344   // Build a sequence of copy-to-reg nodes chained together with token chain
1345   // and flag operands which copy the outgoing args into the appropriate regs.
1346   SDValue InFlag;
1347   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1348     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1349                              RegsToPass[i].second, InFlag);
1350     InFlag = Chain.getValue(1);
1351   }
1352
1353   SmallVector<SDValue, 8> Ops;
1354   unsigned CallOpc = SPUISD::CALL;
1355
1356   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1357   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1358   // node so that legalize doesn't hack it.
1359   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1360     const GlobalValue *GV = G->getGlobal();
1361     EVT CalleeVT = Callee.getValueType();
1362     SDValue Zero = DAG.getConstant(0, PtrVT);
1363     SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);
1364
1365     if (!ST->usingLargeMem()) {
1366       // Turn calls to targets that are defined (i.e., have bodies) into BRSL
1367       // style calls, otherwise, external symbols are BRASL calls. This assumes
1368       // that declared/defined symbols are in the same compilation unit and can
1369       // be reached through PC-relative jumps.
1370       //
1371       // NOTE:
1372       // This may be an unsafe assumption for JIT and really large compilation
1373       // units.
1374       if (GV->isDeclaration()) {
1375         Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
1376       } else {
1377         Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
1378       }
1379     } else {
1380       // "Large memory" mode: Turn all calls into indirect calls with a X-form
1381       // address pairs:
1382       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
1383     }
1384   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1385     EVT CalleeVT = Callee.getValueType();
1386     SDValue Zero = DAG.getConstant(0, PtrVT);
1387     SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
1388         Callee.getValueType());
1389
1390     if (!ST->usingLargeMem()) {
1391       Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
1392     } else {
1393       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
1394     }
1395   } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
1396     // If this is an absolute destination address that appears to be a legal
1397     // local store address, use the munged value.
1398     Callee = SDValue(Dest, 0);
1399   }
1400
1401   Ops.push_back(Chain);
1402   Ops.push_back(Callee);
1403
1404   // Add argument registers to the end of the list so that they are known live
1405   // into the call.
1406   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1407     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1408                                   RegsToPass[i].second.getValueType()));
1409
1410   if (InFlag.getNode())
1411     Ops.push_back(InFlag);
1412   // Returns a chain and a flag for retval copy to use.
1413   Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
1414                       &Ops[0], Ops.size());
1415   InFlag = Chain.getValue(1);
1416
1417   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
1418                              DAG.getIntPtrConstant(0, true), InFlag);
1419   if (!Ins.empty())
1420     InFlag = Chain.getValue(1);
1421
1422   // If the function returns void, just return the chain.
1423   if (Ins.empty())
1424     return Chain;
1425
1426   // Now handle the return value(s)
1427   SmallVector<CCValAssign, 16> RVLocs;
1428   CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1429                     getTargetMachine(), RVLocs, *DAG.getContext());
1430   CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
1431
1432
1433   // If the call has results, copy the values out of the ret val registers.
1434   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1435     CCValAssign VA = RVLocs[i];
1436
1437     SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1438                                      InFlag);
1439     Chain = Val.getValue(1);
1440     InFlag = Val.getValue(2);
1441     InVals.push_back(Val);
1442    }
1443
1444   return Chain;
1445 }
1446
1447 SDValue
1448 SPUTargetLowering::LowerReturn(SDValue Chain,
1449                                CallingConv::ID CallConv, bool isVarArg,
1450                                const SmallVectorImpl<ISD::OutputArg> &Outs,
1451                                const SmallVectorImpl<SDValue> &OutVals,
1452                                DebugLoc dl, SelectionDAG &DAG) const {
1453
1454   SmallVector<CCValAssign, 16> RVLocs;
1455   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1456                  getTargetMachine(), RVLocs, *DAG.getContext());
1457   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
1458
1459   // If this is the first return lowered for this function, add the regs to the
1460   // liveout set for the function.
1461   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1462     for (unsigned i = 0; i != RVLocs.size(); ++i)
1463       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1464   }
1465
1466   SDValue Flag;
1467
1468   // Copy the result values into the output registers.
1469   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1470     CCValAssign &VA = RVLocs[i];
1471     assert(VA.isRegLoc() && "Can only return in registers!");
1472     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1473                              OutVals[i], Flag);
1474     Flag = Chain.getValue(1);
1475   }
1476
1477   if (Flag.getNode())
1478     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
1479   else
1480     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
1481 }
1482
1483
1484 //===----------------------------------------------------------------------===//
1485 // Vector related lowering:
1486 //===----------------------------------------------------------------------===//
1487
1488 static ConstantSDNode *
1489 getVecImm(SDNode *N) {
1490   SDValue OpVal(0, 0);
1491
1492   // Check to see if this buildvec has a single non-undef value in its elements.
1493   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1494     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
1495     if (OpVal.getNode() == 0)
1496       OpVal = N->getOperand(i);
1497     else if (OpVal != N->getOperand(i))
1498       return 0;
1499   }
1500
1501   if (OpVal.getNode() != 0) {
1502     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1503       return CN;
1504     }
1505   }
1506
1507   return 0;
1508 }
1509
1510 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
1511 /// and the value fits into an unsigned 18-bit constant, and if so, return the
1512 /// constant
1513 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
1514                               EVT ValueType) {
1515   if (ConstantSDNode *CN = getVecImm(N)) {
1516     uint64_t Value = CN->getZExtValue();
1517     if (ValueType == MVT::i64) {
1518       uint64_t UValue = CN->getZExtValue();
1519       uint32_t upper = uint32_t(UValue >> 32);
1520       uint32_t lower = uint32_t(UValue);
1521       if (upper != lower)
1522         return SDValue();
1523       Value = Value >> 32;
1524     }
1525     if (Value <= 0x3ffff)
1526       return DAG.getTargetConstant(Value, ValueType);
1527   }
1528
1529   return SDValue();
1530 }
1531
1532 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
1533 /// and the value fits into a signed 16-bit constant, and if so, return the
1534 /// constant
1535 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
1536                               EVT ValueType) {
1537   if (ConstantSDNode *CN = getVecImm(N)) {
1538     int64_t Value = CN->getSExtValue();
1539     if (ValueType == MVT::i64) {
1540       uint64_t UValue = CN->getZExtValue();
1541       uint32_t upper = uint32_t(UValue >> 32);
1542       uint32_t lower = uint32_t(UValue);
1543       if (upper != lower)
1544         return SDValue();
1545       Value = Value >> 32;
1546     }
1547     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
1548       return DAG.getTargetConstant(Value, ValueType);
1549     }
1550   }
1551
1552   return SDValue();
1553 }
1554
1555 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
1556 /// and the value fits into a signed 10-bit constant, and if so, return the
1557 /// constant
1558 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
1559                               EVT ValueType) {
1560   if (ConstantSDNode *CN = getVecImm(N)) {
1561     int64_t Value = CN->getSExtValue();
1562     if (ValueType == MVT::i64) {
1563       uint64_t UValue = CN->getZExtValue();
1564       uint32_t upper = uint32_t(UValue >> 32);
1565       uint32_t lower = uint32_t(UValue);
1566       if (upper != lower)
1567         return SDValue();
1568       Value = Value >> 32;
1569     }
1570     if (isInt<10>(Value))
1571       return DAG.getTargetConstant(Value, ValueType);
1572   }
1573
1574   return SDValue();
1575 }
1576
1577 /// get_vec_i8imm - Test if this vector is a vector filled with the same value
1578 /// and the value fits into a signed 8-bit constant, and if so, return the
1579 /// constant.
1580 ///
1581 /// @note: The incoming vector is v16i8 because that's the only way we can load
1582 /// constant vectors. Thus, we test to see if the upper and lower bytes are the
1583 /// same value.
1584 SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
1585                              EVT ValueType) {
1586   if (ConstantSDNode *CN = getVecImm(N)) {
1587     int Value = (int) CN->getZExtValue();
1588     if (ValueType == MVT::i16
1589         && Value <= 0xffff                 /* truncated from uint64_t */
1590         && ((short) Value >> 8) == ((short) Value & 0xff))
1591       return DAG.getTargetConstant(Value & 0xff, ValueType);
1592     else if (ValueType == MVT::i8
1593              && (Value & 0xff) == Value)
1594       return DAG.getTargetConstant(Value, ValueType);
1595   }
1596
1597   return SDValue();
1598 }
1599
1600 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
1601 /// and the value fits into a signed 16-bit constant, and if so, return the
1602 /// constant
1603 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
1604                                EVT ValueType) {
1605   if (ConstantSDNode *CN = getVecImm(N)) {
1606     uint64_t Value = CN->getZExtValue();
1607     if ((ValueType == MVT::i32
1608           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
1609         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
1610       return DAG.getTargetConstant(Value >> 16, ValueType);
1611   }
1612
1613   return SDValue();
1614 }
1615
1616 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
1617 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
1618   if (ConstantSDNode *CN = getVecImm(N)) {
1619     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
1620   }
1621
1622   return SDValue();
1623 }
1624
1625 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
1626 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
1627   if (ConstantSDNode *CN = getVecImm(N)) {
1628     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
1629   }
1630
1631   return SDValue();
1632 }
1633
1634 //! Lower a BUILD_VECTOR instruction creatively:
1635 static SDValue
1636 LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
1637   EVT VT = Op.getValueType();
1638   EVT EltVT = VT.getVectorElementType();
1639   DebugLoc dl = Op.getDebugLoc();
1640   BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
1641   assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
1642   unsigned minSplatBits = EltVT.getSizeInBits();
1643
1644   if (minSplatBits < 16)
1645     minSplatBits = 16;
1646
1647   APInt APSplatBits, APSplatUndef;
1648   unsigned SplatBitSize;
1649   bool HasAnyUndefs;
1650
1651   if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
1652                             HasAnyUndefs, minSplatBits)
1653       || minSplatBits < SplatBitSize)
1654     return SDValue();   // Wasn't a constant vector or splat exceeded min
1655
1656   uint64_t SplatBits = APSplatBits.getZExtValue();
1657
1658   switch (VT.getSimpleVT().SimpleTy) {
1659   default:
1660     report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
1661                        Twine(VT.getEVTString()));
1662     /*NOTREACHED*/
1663   case MVT::v4f32: {
1664     uint32_t Value32 = uint32_t(SplatBits);
1665     assert(SplatBitSize == 32
1666            && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
1667     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1668     SDValue T = DAG.getConstant(Value32, MVT::i32);
1669     return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
1670                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
1671     break;
1672   }
1673   case MVT::v2f64: {
1674     uint64_t f64val = uint64_t(SplatBits);
1675     assert(SplatBitSize == 64
1676            && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
1677     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1678     SDValue T = DAG.getConstant(f64val, MVT::i64);
1679     return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
1680                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
1681     break;
1682   }
1683   case MVT::v16i8: {
1684    // 8-bit constants have to be expanded to 16-bits
1685    unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
1686    SmallVector<SDValue, 8> Ops;
1687
1688    Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
1689    return DAG.getNode(ISD::BITCAST, dl, VT,
1690                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
1691   }
1692   case MVT::v8i16: {
1693     unsigned short Value16 = SplatBits;
1694     SDValue T = DAG.getConstant(Value16, EltVT);
1695     SmallVector<SDValue, 8> Ops;
1696
1697     Ops.assign(8, T);
1698     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
1699   }
1700   case MVT::v4i32: {
1701     SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
1702     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
1703   }
1704   case MVT::v2i64: {
1705     return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
1706   }
1707   }
1708
1709   return SDValue();
1710 }
1711
1712 /*!
1713  */
1714 SDValue
1715 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
1716                      DebugLoc dl) {
1717   uint32_t upper = uint32_t(SplatVal >> 32);
1718   uint32_t lower = uint32_t(SplatVal);
1719
1720   if (upper == lower) {
1721     // Magic constant that can be matched by IL, ILA, et. al.
1722     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
1723     return DAG.getNode(ISD::BITCAST, dl, OpVT,
1724                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1725                                    Val, Val, Val, Val));
1726   } else {
1727     bool upper_special, lower_special;
1728
1729     // NOTE: This code creates common-case shuffle masks that can be easily
1730     // detected as common expressions. It is not attempting to create highly
1731     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
1732
1733     // Detect if the upper or lower half is a special shuffle mask pattern:
1734     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
1735     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
1736
1737     // Both upper and lower are special, lower to a constant pool load:
1738     if (lower_special && upper_special) {
1739       SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
1740       return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
1741                          SplatValCN, SplatValCN);
1742     }
1743
1744     SDValue LO32;
1745     SDValue HI32;
1746     SmallVector<SDValue, 16> ShufBytes;
1747     SDValue Result;
1748
1749     // Create lower vector if not a special pattern
1750     if (!lower_special) {
1751       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
1752       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1753                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1754                                      LO32C, LO32C, LO32C, LO32C));
1755     }
1756
1757     // Create upper vector if not a special pattern
1758     if (!upper_special) {
1759       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
1760       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1761                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1762                                      HI32C, HI32C, HI32C, HI32C));
1763     }
1764
1765     // If either upper or lower are special, then the two input operands are
1766     // the same (basically, one of them is a "don't care")
1767     if (lower_special)
1768       LO32 = HI32;
1769     if (upper_special)
1770       HI32 = LO32;
1771
1772     for (int i = 0; i < 4; ++i) {
1773       uint64_t val = 0;
1774       for (int j = 0; j < 4; ++j) {
1775         SDValue V;
1776         bool process_upper, process_lower;
1777         val <<= 8;
1778         process_upper = (upper_special && (i & 1) == 0);
1779         process_lower = (lower_special && (i & 1) == 1);
1780
1781         if (process_upper || process_lower) {
1782           if ((process_upper && upper == 0)
1783                   || (process_lower && lower == 0))
1784             val |= 0x80;
1785           else if ((process_upper && upper == 0xffffffff)
1786                   || (process_lower && lower == 0xffffffff))
1787             val |= 0xc0;
1788           else if ((process_upper && upper == 0x80000000)
1789                   || (process_lower && lower == 0x80000000))
1790             val |= (j == 0 ? 0xe0 : 0x80);
1791         } else
1792           val |= i * 4 + j + ((i & 1) * 16);
1793       }
1794
1795       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
1796     }
1797
1798     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
1799                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1800                                    &ShufBytes[0], ShufBytes.size()));
1801   }
1802 }
1803
1804 /// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
1805 /// which the Cell can operate. The code inspects V3 to ascertain whether the
1806 /// permutation vector, V3, is monotonically increasing with one "exception"
1807 /// element, e.g., (0, 1, _, 3). If this is the case, then generate a
1808 /// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
1809 /// In either case, the net result is going to eventually invoke SHUFB to
1810 /// permute/shuffle the bytes from V1 and V2.
1811 /// \note
1812 /// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
1813 /// control word for byte/halfword/word insertion. This takes care of a single
1814 /// element move from V2 into V1.
1815 /// \note
1816 /// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
1817 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
1818   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
1819   SDValue V1 = Op.getOperand(0);
1820   SDValue V2 = Op.getOperand(1);
1821   DebugLoc dl = Op.getDebugLoc();
1822
1823   if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
1824
1825   // If we have a single element being moved from V1 to V2, this can be handled
1826   // using the C*[DX] compute mask instructions, but the vector elements have
1827   // to be monotonically increasing with one exception element, and the source
1828   // slot of the element to move must be the same as the destination.
1829   EVT VecVT = V1.getValueType();
1830   EVT EltVT = VecVT.getVectorElementType();
1831   unsigned EltsFromV2 = 0;
1832   unsigned V2EltOffset = 0;
1833   unsigned V2EltIdx0 = 0;
1834   unsigned CurrElt = 0;
1835   unsigned MaxElts = VecVT.getVectorNumElements();
1836   unsigned PrevElt = 0;
1837   bool monotonic = true;
1838   bool rotate = true;
1839   int rotamt=0;
1840   EVT maskVT;             // which of the c?d instructions to use
1841
1842   if (EltVT == MVT::i8) {
1843     V2EltIdx0 = 16;
1844     maskVT = MVT::v16i8;
1845   } else if (EltVT == MVT::i16) {
1846     V2EltIdx0 = 8;
1847     maskVT = MVT::v8i16;
1848   } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
1849     V2EltIdx0 = 4;
1850     maskVT = MVT::v4i32;
1851   } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
1852     V2EltIdx0 = 2;
1853     maskVT = MVT::v2i64;
1854   } else
1855     llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
1856
1857   for (unsigned i = 0; i != MaxElts; ++i) {
1858     if (SVN->getMaskElt(i) < 0)
1859       continue;
1860
1861     unsigned SrcElt = SVN->getMaskElt(i);
1862
1863     if (monotonic) {
1864       if (SrcElt >= V2EltIdx0) {
1865         // TODO: optimize for the monotonic case when several consecutive
1866         // elements are taken form V2. Do we ever get such a case?
1867         if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
1868           V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
1869         else
1870           monotonic = false;
1871         ++EltsFromV2;
1872       } else if (CurrElt != SrcElt) {
1873         monotonic = false;
1874       }
1875
1876       ++CurrElt;
1877     }
1878
1879     if (rotate) {
1880       if (PrevElt > 0 && SrcElt < MaxElts) {
1881         if ((PrevElt == SrcElt - 1)
1882             || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
1883           PrevElt = SrcElt;
1884         } else {
1885           rotate = false;
1886         }
1887       } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
1888         // First time or after a "wrap around"
1889         rotamt = SrcElt-i;
1890         PrevElt = SrcElt;
1891       } else {
1892         // This isn't a rotation, takes elements from vector 2
1893         rotate = false;
1894       }
1895     }
1896   }
1897
1898   if (EltsFromV2 == 1 && monotonic) {
1899     // Compute mask and shuffle
1900     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1901
1902     // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
1903     // R1 ($sp) is used here only as it is guaranteed to have last bits zero
1904     SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
1905                                 DAG.getRegister(SPU::R1, PtrVT),
1906                                 DAG.getConstant(V2EltOffset, MVT::i32));
1907     SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
1908                                      maskVT, Pointer);
1909
1910     // Use shuffle mask in SHUFB synthetic instruction:
1911     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
1912                        ShufMaskOp);
1913   } else if (rotate) {
1914     if (rotamt < 0)
1915       rotamt +=MaxElts;
1916     rotamt *= EltVT.getSizeInBits()/8;
1917     return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
1918                        V1, DAG.getConstant(rotamt, MVT::i16));
1919   } else {
1920    // Convert the SHUFFLE_VECTOR mask's input element units to the
1921    // actual bytes.
1922     unsigned BytesPerElement = EltVT.getSizeInBits()/8;
1923
1924     SmallVector<SDValue, 16> ResultMask;
1925     for (unsigned i = 0, e = MaxElts; i != e; ++i) {
1926       unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
1927
1928       for (unsigned j = 0; j < BytesPerElement; ++j)
1929         ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
1930     }
1931     SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
1932                                     &ResultMask[0], ResultMask.size());
1933     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
1934   }
1935 }
1936
1937 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
1938   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
1939   DebugLoc dl = Op.getDebugLoc();
1940
1941   if (Op0.getNode()->getOpcode() == ISD::Constant) {
1942     // For a constant, build the appropriate constant vector, which will
1943     // eventually simplify to a vector register load.
1944
1945     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
1946     SmallVector<SDValue, 16> ConstVecValues;
1947     EVT VT;
1948     size_t n_copies;
1949
1950     // Create a constant vector:
1951     switch (Op.getValueType().getSimpleVT().SimpleTy) {
1952     default: llvm_unreachable("Unexpected constant value type in "
1953                               "LowerSCALAR_TO_VECTOR");
1954     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
1955     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
1956     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
1957     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
1958     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
1959     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
1960     }
1961
1962     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
1963     for (size_t j = 0; j < n_copies; ++j)
1964       ConstVecValues.push_back(CValue);
1965
1966     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
1967                        &ConstVecValues[0], ConstVecValues.size());
1968   } else {
1969     // Otherwise, copy the value from one register to another:
1970     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
1971     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
1972     case MVT::i8:
1973     case MVT::i16:
1974     case MVT::i32:
1975     case MVT::i64:
1976     case MVT::f32:
1977     case MVT::f64:
1978       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
1979     }
1980   }
1981
1982   return SDValue();
1983 }
1984
1985 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
1986   EVT VT = Op.getValueType();
1987   SDValue N = Op.getOperand(0);
1988   SDValue Elt = Op.getOperand(1);
1989   DebugLoc dl = Op.getDebugLoc();
1990   SDValue retval;
1991
1992   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
1993     // Constant argument:
1994     int EltNo = (int) C->getZExtValue();
1995
1996     // sanity checks:
1997     if (VT == MVT::i8 && EltNo >= 16)
1998       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
1999     else if (VT == MVT::i16 && EltNo >= 8)
2000       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
2001     else if (VT == MVT::i32 && EltNo >= 4)
2002       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
2003     else if (VT == MVT::i64 && EltNo >= 2)
2004       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
2005
2006     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
2007       // i32 and i64: Element 0 is the preferred slot
2008       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
2009     }
2010
2011     // Need to generate shuffle mask and extract:
2012     int prefslot_begin = -1, prefslot_end = -1;
2013     int elt_byte = EltNo * VT.getSizeInBits() / 8;
2014
2015     switch (VT.getSimpleVT().SimpleTy) {
2016     default:
2017       assert(false && "Invalid value type!");
2018     case MVT::i8: {
2019       prefslot_begin = prefslot_end = 3;
2020       break;
2021     }
2022     case MVT::i16: {
2023       prefslot_begin = 2; prefslot_end = 3;
2024       break;
2025     }
2026     case MVT::i32:
2027     case MVT::f32: {
2028       prefslot_begin = 0; prefslot_end = 3;
2029       break;
2030     }
2031     case MVT::i64:
2032     case MVT::f64: {
2033       prefslot_begin = 0; prefslot_end = 7;
2034       break;
2035     }
2036     }
2037
2038     assert(prefslot_begin != -1 && prefslot_end != -1 &&
2039            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
2040
2041     unsigned int ShufBytes[16] = {
2042       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2043     };
2044     for (int i = 0; i < 16; ++i) {
2045       // zero fill uppper part of preferred slot, don't care about the
2046       // other slots:
2047       unsigned int mask_val;
2048       if (i <= prefslot_end) {
2049         mask_val =
2050           ((i < prefslot_begin)
2051            ? 0x80
2052            : elt_byte + (i - prefslot_begin));
2053
2054         ShufBytes[i] = mask_val;
2055       } else
2056         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
2057     }
2058
2059     SDValue ShufMask[4];
2060     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
2061       unsigned bidx = i * 4;
2062       unsigned int bits = ((ShufBytes[bidx] << 24) |
2063                            (ShufBytes[bidx+1] << 16) |
2064                            (ShufBytes[bidx+2] << 8) |
2065                            ShufBytes[bidx+3]);
2066       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
2067     }
2068
2069     SDValue ShufMaskVec =
2070       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2071                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
2072
2073     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2074                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
2075                                      N, N, ShufMaskVec));
2076   } else {
2077     // Variable index: Rotate the requested element into slot 0, then replicate
2078     // slot 0 across the vector
2079     EVT VecVT = N.getValueType();
2080     if (!VecVT.isSimple() || !VecVT.isVector()) {
2081       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
2082                         "vector type!");
2083     }
2084
2085     // Make life easier by making sure the index is zero-extended to i32
2086     if (Elt.getValueType() != MVT::i32)
2087       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
2088
2089     // Scale the index to a bit/byte shift quantity
2090     APInt scaleFactor =
2091             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
2092     unsigned scaleShift = scaleFactor.logBase2();
2093     SDValue vecShift;
2094
2095     if (scaleShift > 0) {
2096       // Scale the shift factor:
2097       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
2098                         DAG.getConstant(scaleShift, MVT::i32));
2099     }
2100
2101     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
2102
2103     // Replicate the bytes starting at byte 0 across the entire vector (for
2104     // consistency with the notion of a unified register set)
2105     SDValue replicate;
2106
2107     switch (VT.getSimpleVT().SimpleTy) {
2108     default:
2109       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
2110                         "type");
2111       /*NOTREACHED*/
2112     case MVT::i8: {
2113       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
2114       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2115                               factor, factor, factor, factor);
2116       break;
2117     }
2118     case MVT::i16: {
2119       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
2120       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2121                               factor, factor, factor, factor);
2122       break;
2123     }
2124     case MVT::i32:
2125     case MVT::f32: {
2126       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
2127       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2128                               factor, factor, factor, factor);
2129       break;
2130     }
2131     case MVT::i64:
2132     case MVT::f64: {
2133       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
2134       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
2135       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2136                               loFactor, hiFactor, loFactor, hiFactor);
2137       break;
2138     }
2139     }
2140
2141     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2142                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2143                                      vecShift, vecShift, replicate));
2144   }
2145
2146   return retval;
2147 }
2148
2149 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2150   SDValue VecOp = Op.getOperand(0);
2151   SDValue ValOp = Op.getOperand(1);
2152   SDValue IdxOp = Op.getOperand(2);
2153   DebugLoc dl = Op.getDebugLoc();
2154   EVT VT = Op.getValueType();
2155   EVT eltVT = ValOp.getValueType();
2156
2157   // use 0 when the lane to insert to is 'undef'
2158   int64_t Offset=0;
2159   if (IdxOp.getOpcode() != ISD::UNDEF) {
2160     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
2161     assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
2162     Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
2163   }
2164
2165   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2166   // Use $sp ($1) because it's always 16-byte aligned and it's available:
2167   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
2168                                 DAG.getRegister(SPU::R1, PtrVT),
2169                                 DAG.getConstant(Offset, PtrVT));
2170   // widen the mask when dealing with half vectors
2171   EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
2172                                 128/ VT.getVectorElementType().getSizeInBits());
2173   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
2174
2175   SDValue result =
2176     DAG.getNode(SPUISD::SHUFB, dl, VT,
2177                 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
2178                 VecOp,
2179                 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
2180
2181   return result;
2182 }
2183
2184 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
2185                            const TargetLowering &TLI)
2186 {
2187   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
2188   DebugLoc dl = Op.getDebugLoc();
2189   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
2190
2191   assert(Op.getValueType() == MVT::i8);
2192   switch (Opc) {
2193   default:
2194     llvm_unreachable("Unhandled i8 math operator");
2195     /*NOTREACHED*/
2196     break;
2197   case ISD::ADD: {
2198     // 8-bit addition: Promote the arguments up to 16-bits and truncate
2199     // the result:
2200     SDValue N1 = Op.getOperand(1);
2201     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2202     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2203     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2204                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2205
2206   }
2207
2208   case ISD::SUB: {
2209     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
2210     // the result:
2211     SDValue N1 = Op.getOperand(1);
2212     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2213     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2214     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2215                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2216   }
2217   case ISD::ROTR:
2218   case ISD::ROTL: {
2219     SDValue N1 = Op.getOperand(1);
2220     EVT N1VT = N1.getValueType();
2221
2222     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2223     if (!N1VT.bitsEq(ShiftVT)) {
2224       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
2225                        ? ISD::ZERO_EXTEND
2226                        : ISD::TRUNCATE;
2227       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2228     }
2229
2230     // Replicate lower 8-bits into upper 8:
2231     SDValue ExpandArg =
2232       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
2233                   DAG.getNode(ISD::SHL, dl, MVT::i16,
2234                               N0, DAG.getConstant(8, MVT::i32)));
2235
2236     // Truncate back down to i8
2237     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2238                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
2239   }
2240   case ISD::SRL:
2241   case ISD::SHL: {
2242     SDValue N1 = Op.getOperand(1);
2243     EVT N1VT = N1.getValueType();
2244
2245     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2246     if (!N1VT.bitsEq(ShiftVT)) {
2247       unsigned N1Opc = ISD::ZERO_EXTEND;
2248
2249       if (N1.getValueType().bitsGT(ShiftVT))
2250         N1Opc = ISD::TRUNCATE;
2251
2252       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2253     }
2254
2255     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2256                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2257   }
2258   case ISD::SRA: {
2259     SDValue N1 = Op.getOperand(1);
2260     EVT N1VT = N1.getValueType();
2261
2262     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2263     if (!N1VT.bitsEq(ShiftVT)) {
2264       unsigned N1Opc = ISD::SIGN_EXTEND;
2265
2266       if (N1VT.bitsGT(ShiftVT))
2267         N1Opc = ISD::TRUNCATE;
2268       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2269     }
2270
2271     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2272                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2273   }
2274   case ISD::MUL: {
2275     SDValue N1 = Op.getOperand(1);
2276
2277     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2278     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2279     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2280                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2281     break;
2282   }
2283   }
2284
2285   return SDValue();
2286 }
2287
2288 //! Lower byte immediate operations for v16i8 vectors:
2289 static SDValue
2290 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
2291   SDValue ConstVec;
2292   SDValue Arg;
2293   EVT VT = Op.getValueType();
2294   DebugLoc dl = Op.getDebugLoc();
2295
2296   ConstVec = Op.getOperand(0);
2297   Arg = Op.getOperand(1);
2298   if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
2299     if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2300       ConstVec = ConstVec.getOperand(0);
2301     } else {
2302       ConstVec = Op.getOperand(1);
2303       Arg = Op.getOperand(0);
2304       if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2305         ConstVec = ConstVec.getOperand(0);
2306       }
2307     }
2308   }
2309
2310   if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
2311     BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
2312     assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
2313
2314     APInt APSplatBits, APSplatUndef;
2315     unsigned SplatBitSize;
2316     bool HasAnyUndefs;
2317     unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
2318
2319     if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
2320                               HasAnyUndefs, minSplatBits)
2321         && minSplatBits <= SplatBitSize) {
2322       uint64_t SplatBits = APSplatBits.getZExtValue();
2323       SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
2324
2325       SmallVector<SDValue, 16> tcVec;
2326       tcVec.assign(16, tc);
2327       return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
2328                          DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
2329     }
2330   }
2331
2332   // These operations (AND, OR, XOR) are legal, they just couldn't be custom
2333   // lowered.  Return the operation, rather than a null SDValue.
2334   return Op;
2335 }
2336
2337 //! Custom lowering for CTPOP (count population)
2338 /*!
2339   Custom lowering code that counts the number ones in the input
2340   operand. SPU has such an instruction, but it counts the number of
2341   ones per byte, which then have to be accumulated.
2342 */
2343 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
2344   EVT VT = Op.getValueType();
2345   EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
2346                                VT, (128 / VT.getSizeInBits()));
2347   DebugLoc dl = Op.getDebugLoc();
2348
2349   switch (VT.getSimpleVT().SimpleTy) {
2350   default:
2351     assert(false && "Invalid value type!");
2352   case MVT::i8: {
2353     SDValue N = Op.getOperand(0);
2354     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2355
2356     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2357     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2358
2359     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
2360   }
2361
2362   case MVT::i16: {
2363     MachineFunction &MF = DAG.getMachineFunction();
2364     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2365
2366     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
2367
2368     SDValue N = Op.getOperand(0);
2369     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
2370     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
2371     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
2372
2373     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2374     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2375
2376     // CNTB_result becomes the chain to which all of the virtual registers
2377     // CNTB_reg, SUM1_reg become associated:
2378     SDValue CNTB_result =
2379       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
2380
2381     SDValue CNTB_rescopy =
2382       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2383
2384     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
2385
2386     return DAG.getNode(ISD::AND, dl, MVT::i16,
2387                        DAG.getNode(ISD::ADD, dl, MVT::i16,
2388                                    DAG.getNode(ISD::SRL, dl, MVT::i16,
2389                                                Tmp1, Shift1),
2390                                    Tmp1),
2391                        Mask0);
2392   }
2393
2394   case MVT::i32: {
2395     MachineFunction &MF = DAG.getMachineFunction();
2396     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2397
2398     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2399     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2400
2401     SDValue N = Op.getOperand(0);
2402     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2403     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
2404     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
2405     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
2406
2407     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2408     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2409
2410     // CNTB_result becomes the chain to which all of the virtual registers
2411     // CNTB_reg, SUM1_reg become associated:
2412     SDValue CNTB_result =
2413       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
2414
2415     SDValue CNTB_rescopy =
2416       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2417
2418     SDValue Comp1 =
2419       DAG.getNode(ISD::SRL, dl, MVT::i32,
2420                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
2421                   Shift1);
2422
2423     SDValue Sum1 =
2424       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
2425                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
2426
2427     SDValue Sum1_rescopy =
2428       DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
2429
2430     SDValue Comp2 =
2431       DAG.getNode(ISD::SRL, dl, MVT::i32,
2432                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
2433                   Shift2);
2434     SDValue Sum2 =
2435       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
2436                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
2437
2438     return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
2439   }
2440
2441   case MVT::i64:
2442     break;
2443   }
2444
2445   return SDValue();
2446 }
2447
2448 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
2449 /*!
2450  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
2451  All conversions to i64 are expanded to a libcall.
2452  */
2453 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2454                               const SPUTargetLowering &TLI) {
2455   EVT OpVT = Op.getValueType();
2456   SDValue Op0 = Op.getOperand(0);
2457   EVT Op0VT = Op0.getValueType();
2458
2459   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
2460       || OpVT == MVT::i64) {
2461     // Convert f32 / f64 to i32 / i64 via libcall.
2462     RTLIB::Libcall LC =
2463             (Op.getOpcode() == ISD::FP_TO_SINT)
2464              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
2465              : RTLIB::getFPTOUINT(Op0VT, OpVT);
2466     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
2467     SDValue Dummy;
2468     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2469   }
2470
2471   return Op;
2472 }
2473
2474 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
2475 /*!
2476  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
2477  All conversions from i64 are expanded to a libcall.
2478  */
2479 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2480                               const SPUTargetLowering &TLI) {
2481   EVT OpVT = Op.getValueType();
2482   SDValue Op0 = Op.getOperand(0);
2483   EVT Op0VT = Op0.getValueType();
2484
2485   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
2486       || Op0VT == MVT::i64) {
2487     // Convert i32, i64 to f64 via libcall:
2488     RTLIB::Libcall LC =
2489             (Op.getOpcode() == ISD::SINT_TO_FP)
2490              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
2491              : RTLIB::getUINTTOFP(Op0VT, OpVT);
2492     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
2493     SDValue Dummy;
2494     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2495   }
2496
2497   return Op;
2498 }
2499
2500 //! Lower ISD::SETCC
2501 /*!
2502  This handles MVT::f64 (double floating point) condition lowering
2503  */
2504 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
2505                           const TargetLowering &TLI) {
2506   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
2507   DebugLoc dl = Op.getDebugLoc();
2508   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
2509
2510   SDValue lhs = Op.getOperand(0);
2511   SDValue rhs = Op.getOperand(1);
2512   EVT lhsVT = lhs.getValueType();
2513   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
2514
2515   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
2516   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2517   EVT IntVT(MVT::i64);
2518
2519   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
2520   // selected to a NOP:
2521   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
2522   SDValue lhsHi32 =
2523           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2524                       DAG.getNode(ISD::SRL, dl, IntVT,
2525                                   i64lhs, DAG.getConstant(32, MVT::i32)));
2526   SDValue lhsHi32abs =
2527           DAG.getNode(ISD::AND, dl, MVT::i32,
2528                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
2529   SDValue lhsLo32 =
2530           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
2531
2532   // SETO and SETUO only use the lhs operand:
2533   if (CC->get() == ISD::SETO) {
2534     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
2535     // SETUO
2536     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2537     return DAG.getNode(ISD::XOR, dl, ccResultVT,
2538                        DAG.getSetCC(dl, ccResultVT,
2539                                     lhs, DAG.getConstantFP(0.0, lhsVT),
2540                                     ISD::SETUO),
2541                        DAG.getConstant(ccResultAllOnes, ccResultVT));
2542   } else if (CC->get() == ISD::SETUO) {
2543     // Evaluates to true if Op0 is [SQ]NaN
2544     return DAG.getNode(ISD::AND, dl, ccResultVT,
2545                        DAG.getSetCC(dl, ccResultVT,
2546                                     lhsHi32abs,
2547                                     DAG.getConstant(0x7ff00000, MVT::i32),
2548                                     ISD::SETGE),
2549                        DAG.getSetCC(dl, ccResultVT,
2550                                     lhsLo32,
2551                                     DAG.getConstant(0, MVT::i32),
2552                                     ISD::SETGT));
2553   }
2554
2555   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
2556   SDValue rhsHi32 =
2557           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2558                       DAG.getNode(ISD::SRL, dl, IntVT,
2559                                   i64rhs, DAG.getConstant(32, MVT::i32)));
2560
2561   // If a value is negative, subtract from the sign magnitude constant:
2562   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
2563
2564   // Convert the sign-magnitude representation into 2's complement:
2565   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2566                                       lhsHi32, DAG.getConstant(31, MVT::i32));
2567   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
2568   SDValue lhsSelect =
2569           DAG.getNode(ISD::SELECT, dl, IntVT,
2570                       lhsSelectMask, lhsSignMag2TC, i64lhs);
2571
2572   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2573                                       rhsHi32, DAG.getConstant(31, MVT::i32));
2574   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
2575   SDValue rhsSelect =
2576           DAG.getNode(ISD::SELECT, dl, IntVT,
2577                       rhsSelectMask, rhsSignMag2TC, i64rhs);
2578
2579   unsigned compareOp;
2580
2581   switch (CC->get()) {
2582   case ISD::SETOEQ:
2583   case ISD::SETUEQ:
2584     compareOp = ISD::SETEQ; break;
2585   case ISD::SETOGT:
2586   case ISD::SETUGT:
2587     compareOp = ISD::SETGT; break;
2588   case ISD::SETOGE:
2589   case ISD::SETUGE:
2590     compareOp = ISD::SETGE; break;
2591   case ISD::SETOLT:
2592   case ISD::SETULT:
2593     compareOp = ISD::SETLT; break;
2594   case ISD::SETOLE:
2595   case ISD::SETULE:
2596     compareOp = ISD::SETLE; break;
2597   case ISD::SETUNE:
2598   case ISD::SETONE:
2599     compareOp = ISD::SETNE; break;
2600   default:
2601     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
2602   }
2603
2604   SDValue result =
2605           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
2606                        (ISD::CondCode) compareOp);
2607
2608   if ((CC->get() & 0x8) == 0) {
2609     // Ordered comparison:
2610     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
2611                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
2612                                   ISD::SETO);
2613     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
2614                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
2615                                   ISD::SETO);
2616     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
2617
2618     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
2619   }
2620
2621   return result;
2622 }
2623
2624 //! Lower ISD::SELECT_CC
2625 /*!
2626   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
2627   SELB instruction.
2628
2629   \note Need to revisit this in the future: if the code path through the true
2630   and false value computations is longer than the latency of a branch (6
2631   cycles), then it would be more advantageous to branch and insert a new basic
2632   block and branch on the condition. However, this code does not make that
2633   assumption, given the simplisitc uses so far.
2634  */
2635
2636 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
2637                               const TargetLowering &TLI) {
2638   EVT VT = Op.getValueType();
2639   SDValue lhs = Op.getOperand(0);
2640   SDValue rhs = Op.getOperand(1);
2641   SDValue trueval = Op.getOperand(2);
2642   SDValue falseval = Op.getOperand(3);
2643   SDValue condition = Op.getOperand(4);
2644   DebugLoc dl = Op.getDebugLoc();
2645
2646   // NOTE: SELB's arguments: $rA, $rB, $mask
2647   //
2648   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
2649   // where bits in $mask are 1. CCond will be inverted, having 1s where the
2650   // condition was true and 0s where the condition was false. Hence, the
2651   // arguments to SELB get reversed.
2652
2653   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
2654   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
2655   // with another "cannot select select_cc" assert:
2656
2657   SDValue compare = DAG.getNode(ISD::SETCC, dl,
2658                                 TLI.getSetCCResultType(Op.getValueType()),
2659                                 lhs, rhs, condition);
2660   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
2661 }
2662
2663 //! Custom lower ISD::TRUNCATE
2664 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
2665 {
2666   // Type to truncate to
2667   EVT VT = Op.getValueType();
2668   MVT simpleVT = VT.getSimpleVT();
2669   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
2670                                VT, (128 / VT.getSizeInBits()));
2671   DebugLoc dl = Op.getDebugLoc();
2672
2673   // Type to truncate from
2674   SDValue Op0 = Op.getOperand(0);
2675   EVT Op0VT = Op0.getValueType();
2676
2677   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
2678     // Create shuffle mask, least significant doubleword of quadword
2679     unsigned maskHigh = 0x08090a0b;
2680     unsigned maskLow = 0x0c0d0e0f;
2681     // Use a shuffle to perform the truncation
2682     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2683                                    DAG.getConstant(maskHigh, MVT::i32),
2684                                    DAG.getConstant(maskLow, MVT::i32),
2685                                    DAG.getConstant(maskHigh, MVT::i32),
2686                                    DAG.getConstant(maskLow, MVT::i32));
2687
2688     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2689                                        Op0, Op0, shufMask);
2690
2691     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
2692   }
2693
2694   return SDValue();             // Leave the truncate unmolested
2695 }
2696
2697 /*!
2698  * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
2699  * algorithm is to duplicate the sign bit using rotmai to generate at
2700  * least one byte full of sign bits. Then propagate the "sign-byte" into
2701  * the leftmost words and the i64/i32 into the rightmost words using shufb.
2702  *
2703  * @param Op The sext operand
2704  * @param DAG The current DAG
2705  * @return The SDValue with the entire instruction sequence
2706  */
2707 static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
2708 {
2709   DebugLoc dl = Op.getDebugLoc();
2710
2711   // Type to extend to
2712   MVT OpVT = Op.getValueType().getSimpleVT();
2713
2714   // Type to extend from
2715   SDValue Op0 = Op.getOperand(0);
2716   MVT Op0VT = Op0.getValueType().getSimpleVT();
2717
2718   // extend i8 & i16 via i32
2719   if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
2720     Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
2721     Op0VT = MVT::i32;
2722   }
2723
2724   // The type to extend to needs to be a i128 and
2725   // the type to extend from needs to be i64 or i32.
2726   assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
2727           "LowerSIGN_EXTEND: input and/or output operand have wrong size");
2728
2729   // Create shuffle mask
2730   unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
2731   unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
2732   unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
2733   SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2734                                  DAG.getConstant(mask1, MVT::i32),
2735                                  DAG.getConstant(mask1, MVT::i32),
2736                                  DAG.getConstant(mask2, MVT::i32),
2737                                  DAG.getConstant(mask3, MVT::i32));
2738
2739   // Word wise arithmetic right shift to generate at least one byte
2740   // that contains sign bits.
2741   MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
2742   SDValue sraVal = DAG.getNode(ISD::SRA,
2743                  dl,
2744                  mvt,
2745                  DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
2746                  DAG.getConstant(31, MVT::i32));
2747
2748   // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
2749   SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
2750                                         dl, Op0VT, Op0,
2751                                         DAG.getTargetConstant(
2752                                                   SPU::GPRCRegClass.getID(),
2753                                                   MVT::i32)), 0);
2754   // Shuffle bytes - Copy the sign bits into the upper 64 bits
2755   // and the input value into the lower 64 bits.
2756   SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
2757         extended, sraVal, shufMask);
2758   return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
2759 }
2760
2761 //! Custom (target-specific) lowering entry point
2762 /*!
2763   This is where LLVM's DAG selection process calls to do target-specific
2764   lowering of nodes.
2765  */
2766 SDValue
2767 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
2768 {
2769   unsigned Opc = (unsigned) Op.getOpcode();
2770   EVT VT = Op.getValueType();
2771
2772   switch (Opc) {
2773   default: {
2774 #ifndef NDEBUG
2775     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
2776     errs() << "Op.getOpcode() = " << Opc << "\n";
2777     errs() << "*Op.getNode():\n";
2778     Op.getNode()->dump();
2779 #endif
2780     llvm_unreachable(0);
2781   }
2782   case ISD::LOAD:
2783   case ISD::EXTLOAD:
2784   case ISD::SEXTLOAD:
2785   case ISD::ZEXTLOAD:
2786     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
2787   case ISD::STORE:
2788     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
2789   case ISD::ConstantPool:
2790     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
2791   case ISD::GlobalAddress:
2792     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
2793   case ISD::JumpTable:
2794     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
2795   case ISD::ConstantFP:
2796     return LowerConstantFP(Op, DAG);
2797
2798   // i8, i64 math ops:
2799   case ISD::ADD:
2800   case ISD::SUB:
2801   case ISD::ROTR:
2802   case ISD::ROTL:
2803   case ISD::SRL:
2804   case ISD::SHL:
2805   case ISD::SRA: {
2806     if (VT == MVT::i8)
2807       return LowerI8Math(Op, DAG, Opc, *this);
2808     break;
2809   }
2810
2811   case ISD::FP_TO_SINT:
2812   case ISD::FP_TO_UINT:
2813     return LowerFP_TO_INT(Op, DAG, *this);
2814
2815   case ISD::SINT_TO_FP:
2816   case ISD::UINT_TO_FP:
2817     return LowerINT_TO_FP(Op, DAG, *this);
2818
2819   // Vector-related lowering.
2820   case ISD::BUILD_VECTOR:
2821     return LowerBUILD_VECTOR(Op, DAG);
2822   case ISD::SCALAR_TO_VECTOR:
2823     return LowerSCALAR_TO_VECTOR(Op, DAG);
2824   case ISD::VECTOR_SHUFFLE:
2825     return LowerVECTOR_SHUFFLE(Op, DAG);
2826   case ISD::EXTRACT_VECTOR_ELT:
2827     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2828   case ISD::INSERT_VECTOR_ELT:
2829     return LowerINSERT_VECTOR_ELT(Op, DAG);
2830
2831   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
2832   case ISD::AND:
2833   case ISD::OR:
2834   case ISD::XOR:
2835     return LowerByteImmed(Op, DAG);
2836
2837   // Vector and i8 multiply:
2838   case ISD::MUL:
2839     if (VT == MVT::i8)
2840       return LowerI8Math(Op, DAG, Opc, *this);
2841
2842   case ISD::CTPOP:
2843     return LowerCTPOP(Op, DAG);
2844
2845   case ISD::SELECT_CC:
2846     return LowerSELECT_CC(Op, DAG, *this);
2847
2848   case ISD::SETCC:
2849     return LowerSETCC(Op, DAG, *this);
2850
2851   case ISD::TRUNCATE:
2852     return LowerTRUNCATE(Op, DAG);
2853
2854   case ISD::SIGN_EXTEND:
2855     return LowerSIGN_EXTEND(Op, DAG);
2856   }
2857
2858   return SDValue();
2859 }
2860
2861 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
2862                                            SmallVectorImpl<SDValue>&Results,
2863                                            SelectionDAG &DAG) const
2864 {
2865 #if 0
2866   unsigned Opc = (unsigned) N->getOpcode();
2867   EVT OpVT = N->getValueType(0);
2868
2869   switch (Opc) {
2870   default: {
2871     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
2872     errs() << "Op.getOpcode() = " << Opc << "\n";
2873     errs() << "*Op.getNode():\n";
2874     N->dump();
2875     abort();
2876     /*NOTREACHED*/
2877   }
2878   }
2879 #endif
2880
2881   /* Otherwise, return unchanged */
2882 }
2883
2884 //===----------------------------------------------------------------------===//
2885 // Target Optimization Hooks
2886 //===----------------------------------------------------------------------===//
2887
2888 SDValue
2889 SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
2890 {
2891 #if 0
2892   TargetMachine &TM = getTargetMachine();
2893 #endif
2894   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
2895   SelectionDAG &DAG = DCI.DAG;
2896   SDValue Op0 = N->getOperand(0);       // everything has at least one operand
2897   EVT NodeVT = N->getValueType(0);      // The node's value type
2898   EVT Op0VT = Op0.getValueType();       // The first operand's result
2899   SDValue Result;                       // Initially, empty result
2900   DebugLoc dl = N->getDebugLoc();
2901
2902   switch (N->getOpcode()) {
2903   default: break;
2904   case ISD::ADD: {
2905     SDValue Op1 = N->getOperand(1);
2906
2907     if (Op0.getOpcode() == SPUISD::IndirectAddr
2908         || Op1.getOpcode() == SPUISD::IndirectAddr) {
2909       // Normalize the operands to reduce repeated code
2910       SDValue IndirectArg = Op0, AddArg = Op1;
2911
2912       if (Op1.getOpcode() == SPUISD::IndirectAddr) {
2913         IndirectArg = Op1;
2914         AddArg = Op0;
2915       }
2916
2917       if (isa<ConstantSDNode>(AddArg)) {
2918         ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
2919         SDValue IndOp1 = IndirectArg.getOperand(1);
2920
2921         if (CN0->isNullValue()) {
2922           // (add (SPUindirect <arg>, <arg>), 0) ->
2923           // (SPUindirect <arg>, <arg>)
2924
2925 #if !defined(NDEBUG)
2926           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2927             errs() << "\n"
2928                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
2929                  << "With:    (SPUindirect <arg>, <arg>)\n";
2930           }
2931 #endif
2932
2933           return IndirectArg;
2934         } else if (isa<ConstantSDNode>(IndOp1)) {
2935           // (add (SPUindirect <arg>, <const>), <const>) ->
2936           // (SPUindirect <arg>, <const + const>)
2937           ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
2938           int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
2939           SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
2940
2941 #if !defined(NDEBUG)
2942           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2943             errs() << "\n"
2944                  << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
2945                  << "), " << CN0->getSExtValue() << ")\n"
2946                  << "With:    (SPUindirect <arg>, "
2947                  << combinedConst << ")\n";
2948           }
2949 #endif
2950
2951           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
2952                              IndirectArg, combinedValue);
2953         }
2954       }
2955     }
2956     break;
2957   }
2958   case ISD::SIGN_EXTEND:
2959   case ISD::ZERO_EXTEND:
2960   case ISD::ANY_EXTEND: {
2961     if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
2962       // (any_extend (SPUextract_elt0 <arg>)) ->
2963       // (SPUextract_elt0 <arg>)
2964       // Types must match, however...
2965 #if !defined(NDEBUG)
2966       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2967         errs() << "\nReplace: ";
2968         N->dump(&DAG);
2969         errs() << "\nWith:    ";
2970         Op0.getNode()->dump(&DAG);
2971         errs() << "\n";
2972       }
2973 #endif
2974
2975       return Op0;
2976     }
2977     break;
2978   }
2979   case SPUISD::IndirectAddr: {
2980     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
2981       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
2982       if (CN != 0 && CN->isNullValue()) {
2983         // (SPUindirect (SPUaform <addr>, 0), 0) ->
2984         // (SPUaform <addr>, 0)
2985
2986         DEBUG(errs() << "Replace: ");
2987         DEBUG(N->dump(&DAG));
2988         DEBUG(errs() << "\nWith:    ");
2989         DEBUG(Op0.getNode()->dump(&DAG));
2990         DEBUG(errs() << "\n");
2991
2992         return Op0;
2993       }
2994     } else if (Op0.getOpcode() == ISD::ADD) {
2995       SDValue Op1 = N->getOperand(1);
2996       if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
2997         // (SPUindirect (add <arg>, <arg>), 0) ->
2998         // (SPUindirect <arg>, <arg>)
2999         if (CN1->isNullValue()) {
3000
3001 #if !defined(NDEBUG)
3002           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
3003             errs() << "\n"
3004                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
3005                  << "With:    (SPUindirect <arg>, <arg>)\n";
3006           }
3007 #endif
3008
3009           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
3010                              Op0.getOperand(0), Op0.getOperand(1));
3011         }
3012       }
3013     }
3014     break;
3015   }
3016   case SPUISD::SHL_BITS:
3017   case SPUISD::SHL_BYTES:
3018   case SPUISD::ROTBYTES_LEFT: {
3019     SDValue Op1 = N->getOperand(1);
3020
3021     // Kill degenerate vector shifts:
3022     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
3023       if (CN->isNullValue()) {
3024         Result = Op0;
3025       }
3026     }
3027     break;
3028   }
3029   case SPUISD::PREFSLOT2VEC: {
3030     switch (Op0.getOpcode()) {
3031     default:
3032       break;
3033     case ISD::ANY_EXTEND:
3034     case ISD::ZERO_EXTEND:
3035     case ISD::SIGN_EXTEND: {
3036       // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
3037       // <arg>
3038       // but only if the SPUprefslot2vec and <arg> types match.
3039       SDValue Op00 = Op0.getOperand(0);
3040       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
3041         SDValue Op000 = Op00.getOperand(0);
3042         if (Op000.getValueType() == NodeVT) {
3043           Result = Op000;
3044         }
3045       }
3046       break;
3047     }
3048     case SPUISD::VEC2PREFSLOT: {
3049       // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
3050       // <arg>
3051       Result = Op0.getOperand(0);
3052       break;
3053     }
3054     }
3055     break;
3056   }
3057   }
3058
3059   // Otherwise, return unchanged.
3060 #ifndef NDEBUG
3061   if (Result.getNode()) {
3062     DEBUG(errs() << "\nReplace.SPU: ");
3063     DEBUG(N->dump(&DAG));
3064     DEBUG(errs() << "\nWith:        ");
3065     DEBUG(Result.getNode()->dump(&DAG));
3066     DEBUG(errs() << "\n");
3067   }
3068 #endif
3069
3070   return Result;
3071 }
3072
3073 //===----------------------------------------------------------------------===//
3074 // Inline Assembly Support
3075 //===----------------------------------------------------------------------===//
3076
3077 /// getConstraintType - Given a constraint letter, return the type of
3078 /// constraint it is for this target.
3079 SPUTargetLowering::ConstraintType
3080 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
3081   if (ConstraintLetter.size() == 1) {
3082     switch (ConstraintLetter[0]) {
3083     default: break;
3084     case 'b':
3085     case 'r':
3086     case 'f':
3087     case 'v':
3088     case 'y':
3089       return C_RegisterClass;
3090     }
3091   }
3092   return TargetLowering::getConstraintType(ConstraintLetter);
3093 }
3094
3095 /// Examine constraint type and operand type and determine a weight value.
3096 /// This object must already have been set up with the operand type
3097 /// and the current alternative constraint selected.
3098 TargetLowering::ConstraintWeight
3099 SPUTargetLowering::getSingleConstraintMatchWeight(
3100     AsmOperandInfo &info, const char *constraint) const {
3101   ConstraintWeight weight = CW_Invalid;
3102   Value *CallOperandVal = info.CallOperandVal;
3103     // If we don't have a value, we can't do a match,
3104     // but allow it at the lowest weight.
3105   if (CallOperandVal == NULL)
3106     return CW_Default;
3107   // Look at the constraint type.
3108   switch (*constraint) {
3109   default:
3110     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
3111     break;
3112     //FIXME: Seems like the supported constraint letters were just copied
3113     // from PPC, as the following doesn't correspond to the GCC docs.
3114     // I'm leaving it so until someone adds the corresponding lowering support.
3115   case 'b':
3116   case 'r':
3117   case 'f':
3118   case 'd':
3119   case 'v':
3120   case 'y':
3121     weight = CW_Register;
3122     break;
3123   }
3124   return weight;
3125 }
3126
3127 std::pair<unsigned, const TargetRegisterClass*>
3128 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
3129                                                 EVT VT) const
3130 {
3131   if (Constraint.size() == 1) {
3132     // GCC RS6000 Constraint Letters
3133     switch (Constraint[0]) {
3134     case 'b':   // R1-R31
3135     case 'r':   // R0-R31
3136       if (VT == MVT::i64)
3137         return std::make_pair(0U, SPU::R64CRegisterClass);
3138       return std::make_pair(0U, SPU::R32CRegisterClass);
3139     case 'f':
3140       if (VT == MVT::f32)
3141         return std::make_pair(0U, SPU::R32FPRegisterClass);
3142       else if (VT == MVT::f64)
3143         return std::make_pair(0U, SPU::R64FPRegisterClass);
3144       break;
3145     case 'v':
3146       return std::make_pair(0U, SPU::GPRCRegisterClass);
3147     }
3148   }
3149
3150   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
3151 }
3152
3153 //! Compute used/known bits for a SPU operand
3154 void
3155 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
3156                                                   const APInt &Mask,
3157                                                   APInt &KnownZero,
3158                                                   APInt &KnownOne,
3159                                                   const SelectionDAG &DAG,
3160                                                   unsigned Depth ) const {
3161 #if 0
3162   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
3163
3164   switch (Op.getOpcode()) {
3165   default:
3166     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
3167     break;
3168   case CALL:
3169   case SHUFB:
3170   case SHUFFLE_MASK:
3171   case CNTB:
3172   case SPUISD::PREFSLOT2VEC:
3173   case SPUISD::LDRESULT:
3174   case SPUISD::VEC2PREFSLOT:
3175   case SPUISD::SHLQUAD_L_BITS:
3176   case SPUISD::SHLQUAD_L_BYTES:
3177   case SPUISD::VEC_ROTL:
3178   case SPUISD::VEC_ROTR:
3179   case SPUISD::ROTBYTES_LEFT:
3180   case SPUISD::SELECT_MASK:
3181   case SPUISD::SELB:
3182   }
3183 #endif
3184 }
3185
3186 unsigned
3187 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
3188                                                    unsigned Depth) const {
3189   switch (Op.getOpcode()) {
3190   default:
3191     return 1;
3192
3193   case ISD::SETCC: {
3194     EVT VT = Op.getValueType();
3195
3196     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
3197       VT = MVT::i32;
3198     }
3199     return VT.getSizeInBits();
3200   }
3201   }
3202 }
3203
3204 // LowerAsmOperandForConstraint
3205 void
3206 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
3207                                                 std::string &Constraint,
3208                                                 std::vector<SDValue> &Ops,
3209                                                 SelectionDAG &DAG) const {
3210   // Default, for the time being, to the base class handler
3211   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3212 }
3213
3214 /// isLegalAddressImmediate - Return true if the integer value can be used
3215 /// as the offset of the target addressing mode.
3216 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
3217                                                 const Type *Ty) const {
3218   // SPU's addresses are 256K:
3219   return (V > -(1 << 18) && V < (1 << 18) - 1);
3220 }
3221
3222 bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
3223   return false;
3224 }
3225
3226 bool
3227 SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3228   // The SPU target isn't yet aware of offsets.
3229   return false;
3230 }
3231
3232 // can we compare to Imm without writing it into a register?
3233 bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
3234   //ceqi, cgti, etc. all take s10 operand
3235   return isInt<10>(Imm);
3236 }
3237
3238 bool
3239 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
3240                                          const Type * ) const{
3241
3242   // A-form: 18bit absolute address.
3243   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
3244     return true;
3245
3246   // D-form: reg + 14bit offset
3247   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
3248     return true;
3249
3250   // X-form: reg+reg
3251   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
3252     return true;
3253
3254   return false;
3255 }