lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32 /// Configures R600 lowering: register classes, legalization actions, DAG combines and scheduling preference.
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42   // Derive register properties once every register class has been added.
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions: the f32 and i32 condition codes listed below are Expand.
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63   // FCOS/FSIN are custom lowered; LowerOperation routes both to LowerTrig.
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Unsigned ADD/SUB with overflow: Custom only when the subtarget has CARRY / BORROW.
  95   // TODO: turn these into Legal?
  96   if (Subtarget->hasCARRY())
  97     setOperationAction(ISD::UADDO, MVT::i32, Custom);
  98
  99   if (Subtarget->hasBORROW())
 100     setOperationAction(ISD::USUBO, MVT::i32, Custom);
 101
 102   // Expand sign extension of vectors; scalar i1/i8/i16 are expanded only without BFE.
 103   if (!Subtarget->hasBFE())
 104     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 105
 106   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 108
 109   if (!Subtarget->hasBFE())
 110     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 113
 114   if (!Subtarget->hasBFE())
 115     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 116   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 117   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 118
 119   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 122
 123   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 124
 125
 126   // Legalize loads and stores to the private address space.
 127   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 128   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 129   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 130
 131   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 132   // spaces, so it is custom lowered to handle those where it isn't.
 133   for (MVT VT : MVT::integer_valuetypes()) {
 134     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 137
 138     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 141
 142     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 145   }
 146
 147   setOperationAction(ISD::STORE, MVT::i8, Custom);
 148   setOperationAction(ISD::STORE, MVT::i32, Custom);
 149   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 150   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 151   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 152   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 153   // NOTE(review): LOAD i32/v4i32 were already marked Custom above; these two repeats look redundant.
 154   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 155   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 156   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 157
 158   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 159   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 160   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 161   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 162
 163   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 164   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 165   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 166   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 167   // Opcodes for which target-specific DAG combines are requested.
 168   setTargetDAGCombine(ISD::FP_ROUND);
 169   setTargetDAGCombine(ISD::FP_TO_SINT);
 170   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 171   setTargetDAGCombine(ISD::SELECT_CC);
 172   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 173
 174   // These should be replaced by UDIVREM, but it does not happen automatically
 175   // during Type Legalization
 176   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 177   setOperationAction(ISD::UREM, MVT::i64, Custom);
 178   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 179   setOperationAction(ISD::SREM, MVT::i64, Custom);
 180
 181   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 182   //  to be Legal/Custom in order to avoid library calls.
 183   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 184   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 185   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 186
 187   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 188   // ADDC/SUBC/ADDE/SUBE are expanded for both i32 and i64.
 189   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 190   for (MVT VT : ScalarIntVTs) {
 191     setOperationAction(ISD::ADDC, VT, Expand);
 192     setOperationAction(ISD::SUBC, VT, Expand);
 193     setOperationAction(ISD::ADDE, VT, Expand);
 194     setOperationAction(ISD::SUBE, VT, Expand);
 195   }
 196
 197   setSchedulingPreference(Sched::Source);
 198 }
 199 /// Expands R600 pseudo instructions: most cases emit replacement(s) at MI and erase MI; early returns keep MI.
 200 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 201     MachineInstr * MI, MachineBasicBlock * BB) const {
 202   MachineFunction * MF = BB->getParent();
 203   MachineRegisterInfo &MRI = MF->getRegInfo();
 204   MachineBasicBlock::iterator I = *MI;
 205   const R600InstrInfo *TII =
 206       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 207
 208   switch (MI->getOpcode()) {
 209   default:
 210     // Replace LDS_*_RET instructions that don't have any uses with the
 211     // equivalent LDS_*_NORET instruction.
 212     if (TII->isLDSRetInstr(MI->getOpcode())) {
 213       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 214       assert(DstIdx != -1);
 215       MachineInstrBuilder NewMI;
 216       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 217       //        LDS_1A2D support and remove this special case.
 218       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 219            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 220         return BB;
 221       // Emit the NORET opcode and copy operands 1..e-1 of MI onto it (operand 0 is skipped).
 222       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 223                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 224       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 225         NewMI.addOperand(MI->getOperand(i));
 226       }
 227     } else {
 228       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 229     }
 230     break;
 231   case AMDGPU::CLAMP_R600: { // Becomes a MOV carrying MO_FLAG_CLAMP
 232     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 233                                                    AMDGPU::MOV,
 234                                                    MI->getOperand(0).getReg(),
 235                                                    MI->getOperand(1).getReg());
 236     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 237     break;
 238   }
 239   // FABS and FNEG below are handled the same way, with MO_FLAG_ABS / MO_FLAG_NEG.
 240   case AMDGPU::FABS_R600: {
 241     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 242                                                     AMDGPU::MOV,
 243                                                     MI->getOperand(0).getReg(),
 244                                                     MI->getOperand(1).getReg());
 245     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 246     break;
 247   }
 248
 249   case AMDGPU::FNEG_R600: {
 250     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 251                                                     AMDGPU::MOV,
 252                                                     MI->getOperand(0).getReg(),
 253                                                     MI->getOperand(1).getReg());
 254     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 255     break;
 256   }
 257   // MASK_WRITE adds MO_FLAG_MASK to the instruction defining its (virtual) register operand.
 258   case AMDGPU::MASK_WRITE: {
 259     unsigned maskedRegister = MI->getOperand(0).getReg();
 260     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 261     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 262     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 263     break;
 264   }
 265   // Immediate moves: the f32 immediate is passed to buildMovImm as its raw bit pattern.
 266   case AMDGPU::MOV_IMM_F32:
 267     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 268                      MI->getOperand(1).getFPImm()->getValueAPF()
 269                          .bitcastToAPInt().getZExtValue());
 270     break;
 271   case AMDGPU::MOV_IMM_I32:
 272     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 273                      MI->getOperand(1).getImm());
 274     break;
 275   case AMDGPU::CONST_COPY: { // MOV from ALU_CONST; operand 1 is written to src0_sel
 276     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 277         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 278     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 279         MI->getOperand(1).getImm());
 280     break;
 281   }
 282   // Re-emit the write with the end-of-program bit set when the next instruction is RETURN.
 283   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 284   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 285   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 286     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 287     // NOTE(review): std::next(I) is dereferenced without an end() check - confirm MI is never last in BB.
 288     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 289             .addOperand(MI->getOperand(0))
 290             .addOperand(MI->getOperand(1))
 291             .addImm(EOP); // Set End of program bit
 292     break;
 293   }
 294   // TXD: emit TEX_SET_GRADIENTS_H/V into temporaries, then TEX_SAMPLE_G with both as implicit uses.
 295   case AMDGPU::TXD: {
 296     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 297     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 298     MachineOperand &RID = MI->getOperand(4);
 299     MachineOperand &SID = MI->getOperand(5);
 300     unsigned TextureId = MI->getOperand(6).getImm();
 301     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 302     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 303     // Adjust the Src* selects and CT* immediates for the texture target given by TextureId.
 304     switch (TextureId) {
 305     case 5: // Rect
 306       CTX = CTY = 0;
 307       break;
 308     case 6: // Shadow1D
 309       SrcW = SrcZ;
 310       break;
 311     case 7: // Shadow2D
 312       SrcW = SrcZ;
 313       break;
 314     case 8: // ShadowRect
 315       CTX = CTY = 0;
 316       SrcW = SrcZ;
 317       break;
 318     case 9: // 1DArray
 319       SrcZ = SrcY;
 320       CTZ = 0;
 321       break;
 322     case 10: // 2DArray
 323       CTZ = 0;
 324       break;
 325     case 11: // Shadow1DArray
 326       SrcZ = SrcY;
 327       CTZ = 0;
 328       break;
 329     case 12: // Shadow2DArray
 330       CTZ = 0;
 331       break;
 332     }
 333     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 334             .addOperand(MI->getOperand(3))
 335             .addImm(SrcX)
 336             .addImm(SrcY)
 337             .addImm(SrcZ)
 338             .addImm(SrcW)
 339             .addImm(0)
 340             .addImm(0)
 341             .addImm(0)
 342             .addImm(0)
 343             .addImm(1)
 344             .addImm(2)
 345             .addImm(3)
 346             .addOperand(RID)
 347             .addOperand(SID)
 348             .addImm(CTX)
 349             .addImm(CTY)
 350             .addImm(CTZ)
 351             .addImm(CTW);
 352     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 353             .addOperand(MI->getOperand(2))
 354             .addImm(SrcX)
 355             .addImm(SrcY)
 356             .addImm(SrcZ)
 357             .addImm(SrcW)
 358             .addImm(0)
 359             .addImm(0)
 360             .addImm(0)
 361             .addImm(0)
 362             .addImm(1)
 363             .addImm(2)
 364             .addImm(3)
 365             .addOperand(RID)
 366             .addOperand(SID)
 367             .addImm(CTX)
 368             .addImm(CTY)
 369             .addImm(CTZ)
 370             .addImm(CTW);
 371     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 372             .addOperand(MI->getOperand(0))
 373             .addOperand(MI->getOperand(1))
 374             .addImm(SrcX)
 375             .addImm(SrcY)
 376             .addImm(SrcZ)
 377             .addImm(SrcW)
 378             .addImm(0)
 379             .addImm(0)
 380             .addImm(0)
 381             .addImm(0)
 382             .addImm(1)
 383             .addImm(2)
 384             .addImm(3)
 385             .addOperand(RID)
 386             .addOperand(SID)
 387             .addImm(CTX)
 388             .addImm(CTY)
 389             .addImm(CTZ)
 390             .addImm(CTW)
 391             .addReg(T0, RegState::Implicit)
 392             .addReg(T1, RegState::Implicit);
 393     break;
 394   }
 395   // TXD_SHADOW: same expansion as TXD, except the final sample opcode is TEX_SAMPLE_C_G.
 396   case AMDGPU::TXD_SHADOW: {
 397     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 398     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 399     MachineOperand &RID = MI->getOperand(4);
 400     MachineOperand &SID = MI->getOperand(5);
 401     unsigned TextureId = MI->getOperand(6).getImm();
 402     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 403     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 404
 405     switch (TextureId) {
 406     case 5: // Rect
 407       CTX = CTY = 0;
 408       break;
 409     case 6: // Shadow1D
 410       SrcW = SrcZ;
 411       break;
 412     case 7: // Shadow2D
 413       SrcW = SrcZ;
 414       break;
 415     case 8: // ShadowRect
 416       CTX = CTY = 0;
 417       SrcW = SrcZ;
 418       break;
 419     case 9: // 1DArray
 420       SrcZ = SrcY;
 421       CTZ = 0;
 422       break;
 423     case 10: // 2DArray
 424       CTZ = 0;
 425       break;
 426     case 11: // Shadow1DArray
 427       SrcZ = SrcY;
 428       CTZ = 0;
 429       break;
 430     case 12: // Shadow2DArray
 431       CTZ = 0;
 432       break;
 433     }
 434
 435     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 436             .addOperand(MI->getOperand(3))
 437             .addImm(SrcX)
 438             .addImm(SrcY)
 439             .addImm(SrcZ)
 440             .addImm(SrcW)
 441             .addImm(0)
 442             .addImm(0)
 443             .addImm(0)
 444             .addImm(0)
 445             .addImm(1)
 446             .addImm(2)
 447             .addImm(3)
 448             .addOperand(RID)
 449             .addOperand(SID)
 450             .addImm(CTX)
 451             .addImm(CTY)
 452             .addImm(CTZ)
 453             .addImm(CTW);
 454     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 455             .addOperand(MI->getOperand(2))
 456             .addImm(SrcX)
 457             .addImm(SrcY)
 458             .addImm(SrcZ)
 459             .addImm(SrcW)
 460             .addImm(0)
 461             .addImm(0)
 462             .addImm(0)
 463             .addImm(0)
 464             .addImm(1)
 465             .addImm(2)
 466             .addImm(3)
 467             .addOperand(RID)
 468             .addOperand(SID)
 469             .addImm(CTX)
 470             .addImm(CTY)
 471             .addImm(CTZ)
 472             .addImm(CTW);
 473     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 474             .addOperand(MI->getOperand(0))
 475             .addOperand(MI->getOperand(1))
 476             .addImm(SrcX)
 477             .addImm(SrcY)
 478             .addImm(SrcZ)
 479             .addImm(SrcW)
 480             .addImm(0)
 481             .addImm(0)
 482             .addImm(0)
 483             .addImm(0)
 484             .addImm(1)
 485             .addImm(2)
 486             .addImm(3)
 487             .addOperand(RID)
 488             .addOperand(SID)
 489             .addImm(CTX)
 490             .addImm(CTY)
 491             .addImm(CTZ)
 492             .addImm(CTW)
 493             .addReg(T0, RegState::Implicit)
 494             .addReg(T1, RegState::Implicit);
 495     break;
 496   }
 497   // Unconditional branch: re-emitted as JUMP with the same target operand.
 498   case AMDGPU::BRANCH:
 499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 500               .addOperand(MI->getOperand(0));
 501       break;
 502   // Conditional branches: PRED_X defines PREDICATE_BIT from operand 1, then JUMP_COND uses and kills it.
 503   case AMDGPU::BRANCH_COND_f32: {
 504     MachineInstr *NewMI =
 505       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 506               AMDGPU::PREDICATE_BIT)
 507               .addOperand(MI->getOperand(1))
 508               .addImm(OPCODE_IS_NOT_ZERO)
 509               .addImm(0); // Flags
 510     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 511     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 512             .addOperand(MI->getOperand(0))
 513             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 514     break;
 515   }
 516
 517   case AMDGPU::BRANCH_COND_i32: {
 518     MachineInstr *NewMI =
 519       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 520             AMDGPU::PREDICATE_BIT)
 521             .addOperand(MI->getOperand(1))
 522             .addImm(OPCODE_IS_NOT_ZERO_INT)
 523             .addImm(0); // Flags
 524     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 525     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 526            .addOperand(MI->getOperand(0))
 527             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 528     break;
 529   }
 530
 531   case AMDGPU::EG_ExportSwz:
 532   case AMDGPU::R600_ExportSwz: {
 533     // Instruction is left unmodified if it is neither the last export of its type in BB nor followed by RETURN
 534     bool isLastInstructionOfItsType = true;
 535     unsigned InstExportType = MI->getOperand(1).getImm();
 536     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 537          EndBlock = BB->end(); NextExportInst != EndBlock;
 538          NextExportInst = std::next(NextExportInst)) {
 539       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 540           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 541         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 542             .getImm();
 543         if (CurrentInstExportType == InstExportType) {
 544           isLastInstructionOfItsType = false;
 545           break;
 546         }
 547       }
 548     }
 549     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; // NOTE(review): no end() check on std::next(I)
 550     if (!EOP && !isLastInstructionOfItsType)
 551       return BB;
 552     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 553     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 554             .addOperand(MI->getOperand(0))
 555             .addOperand(MI->getOperand(1))
 556             .addOperand(MI->getOperand(2))
 557             .addOperand(MI->getOperand(3))
 558             .addOperand(MI->getOperand(4))
 559             .addOperand(MI->getOperand(5))
 560             .addOperand(MI->getOperand(6))
 561             .addImm(CfInst)
 562             .addImm(EOP);
 563     break;
 564   }
 565   case AMDGPU::RETURN: {
 566     // RETURN instructions must have the live-out registers as implicit uses,
 567     // otherwise they appear dead.
 568     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 569     MachineInstrBuilder MIB(*MF, MI);
 570     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 571       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 572     return BB;
 573   }
 574   }
 575   // Every case that breaks out of the switch has emitted its replacement; remove the original MI.
 576   MI->eraseFromParent();
 577   return BB;
 578 }
 579
 580 //===----------------------------------------------------------------------===//
 581 // Custom DAG Lowering Operations
 582 //===----------------------------------------------------------------------===//
 583
 584 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 585   MachineFunction &MF = DAG.getMachineFunction();
 586   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 587   switch (Op.getOpcode()) {
 588   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 589   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 590   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 591   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 592   case ISD::SRA_PARTS:
 593   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 594   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
 595   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
 596   case ISD::FCOS:
 597   case ISD::FSIN: return LowerTrig(Op, DAG);
 598   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 599   case ISD::STORE: return LowerSTORE(Op, DAG);
 600   case ISD::LOAD: {
 601     SDValue Result = LowerLOAD(Op, DAG);
 602     assert((!Result.getNode() ||
 603             Result.getNode()->getNumValues() == 2) &&
 604            "Load should return a value and a chain");
 605     return Result;
 606   }
 607
 608   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 609   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 610   case ISD::INTRINSIC_VOID: {
 611     SDValue Chain = Op.getOperand(0);
 612     unsigned IntrinsicID =
 613                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 614     switch (IntrinsicID) {
 615     case AMDGPUIntrinsic::AMDGPU_store_output: {
 616       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 617       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 618       MFI->LiveOuts.push_back(Reg);
 619       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 620     }
 621     case AMDGPUIntrinsic::R600_store_swizzle: {
 622       SDLoc DL(Op);
 623       const SDValue Args[8] = {
 624         Chain,
 625         Op.getOperand(2), // Export Value
 626         Op.getOperand(3), // ArrayBase
 627         Op.getOperand(4), // Type
 628         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
 629         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
 630         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
 631         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
 632       };
 633       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
 634     }
 635
 636     // default for switch(IntrinsicID)
 637     default: break;
 638     }
 639     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 640     break;
 641   }
 642   case ISD::INTRINSIC_WO_CHAIN: {
 643     unsigned IntrinsicID =
 644                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 645     EVT VT = Op.getValueType();
 646     SDLoc DL(Op);
 647     switch(IntrinsicID) {
 648     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 649     case AMDGPUIntrinsic::R600_load_input: {
 650       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 651       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 652       MachineFunction &MF = DAG.getMachineFunction();
 653       MachineRegisterInfo &MRI = MF.getRegInfo();
 654       MRI.addLiveIn(Reg);
 655       return DAG.getCopyFromReg(DAG.getEntryNode(),
 656           SDLoc(DAG.getEntryNode()), Reg, VT);
 657     }
 658
 659     case AMDGPUIntrinsic::R600_interp_input: {
 660       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 661       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 662       MachineSDNode *interp;
 663       if (ijb < 0) {
 664         const R600InstrInfo *TII =
 665             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 666         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 667             MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
 668         return DAG.getTargetExtractSubreg(
 669             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 670             DL, MVT::f32, SDValue(interp, 0));
 671       }
 672       MachineFunction &MF = DAG.getMachineFunction();
 673       MachineRegisterInfo &MRI = MF.getRegInfo();
 674       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 675       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 676       MRI.addLiveIn(RegisterI);
 677       MRI.addLiveIn(RegisterJ);
 678       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 679           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 680       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 681           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 682
 683       if (slot % 4 < 2)
 684         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 685             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 686             RegisterJNode, RegisterINode);
 687       else
 688         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 689             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
 690             RegisterJNode, RegisterINode);
 691       return SDValue(interp, slot % 2);
 692     }
 693     case AMDGPUIntrinsic::R600_interp_xy:
 694     case AMDGPUIntrinsic::R600_interp_zw: {
 695       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 696       MachineSDNode *interp;
 697       SDValue RegisterINode = Op.getOperand(2);
 698       SDValue RegisterJNode = Op.getOperand(3);
 699
 700       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 701         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 702             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 703             RegisterJNode, RegisterINode);
 704       else
 705         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 706             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
 707             RegisterJNode, RegisterINode);
 708       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 709           SDValue(interp, 0), SDValue(interp, 1));
 710     }
 711     case AMDGPUIntrinsic::R600_tex:
 712     case AMDGPUIntrinsic::R600_texc:
 713     case AMDGPUIntrinsic::R600_txl:
 714     case AMDGPUIntrinsic::R600_txlc:
 715     case AMDGPUIntrinsic::R600_txb:
 716     case AMDGPUIntrinsic::R600_txbc:
 717     case AMDGPUIntrinsic::R600_txf:
 718     case AMDGPUIntrinsic::R600_txq:
 719     case AMDGPUIntrinsic::R600_ddx:
 720     case AMDGPUIntrinsic::R600_ddy:
 721     case AMDGPUIntrinsic::R600_ldptr: {
 722       unsigned TextureOp;
 723       switch (IntrinsicID) {
 724       case AMDGPUIntrinsic::R600_tex:
 725         TextureOp = 0;
 726         break;
 727       case AMDGPUIntrinsic::R600_texc:
 728         TextureOp = 1;
 729         break;
 730       case AMDGPUIntrinsic::R600_txl:
 731         TextureOp = 2;
 732         break;
 733       case AMDGPUIntrinsic::R600_txlc:
 734         TextureOp = 3;
 735         break;
 736       case AMDGPUIntrinsic::R600_txb:
 737         TextureOp = 4;
 738         break;
 739       case AMDGPUIntrinsic::R600_txbc:
 740         TextureOp = 5;
 741         break;
 742       case AMDGPUIntrinsic::R600_txf:
 743         TextureOp = 6;
 744         break;
 745       case AMDGPUIntrinsic::R600_txq:
 746         TextureOp = 7;
 747         break;
 748       case AMDGPUIntrinsic::R600_ddx:
 749         TextureOp = 8;
 750         break;
 751       case AMDGPUIntrinsic::R600_ddy:
 752         TextureOp = 9;
 753         break;
 754       case AMDGPUIntrinsic::R600_ldptr:
 755         TextureOp = 10;
 756         break;
 757       default:
 758         llvm_unreachable("Unknow Texture Operation");
 759       }
 760
 761       SDValue TexArgs[19] = {
 762         DAG.getConstant(TextureOp, DL, MVT::i32),
 763         Op.getOperand(1),
 764         DAG.getConstant(0, DL, MVT::i32),
 765         DAG.getConstant(1, DL, MVT::i32),
 766         DAG.getConstant(2, DL, MVT::i32),
 767         DAG.getConstant(3, DL, MVT::i32),
 768         Op.getOperand(2),
 769         Op.getOperand(3),
 770         Op.getOperand(4),
 771         DAG.getConstant(0, DL, MVT::i32),
 772         DAG.getConstant(1, DL, MVT::i32),
 773         DAG.getConstant(2, DL, MVT::i32),
 774         DAG.getConstant(3, DL, MVT::i32),
 775         Op.getOperand(5),
 776         Op.getOperand(6),
 777         Op.getOperand(7),
 778         Op.getOperand(8),
 779         Op.getOperand(9),
 780         Op.getOperand(10)
 781       };
 782       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 783     }
 784     case AMDGPUIntrinsic::AMDGPU_dp4: {
 785       SDValue Args[8] = {
 786       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 787           DAG.getConstant(0, DL, MVT::i32)),
 788       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 789           DAG.getConstant(0, DL, MVT::i32)),
 790       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 791           DAG.getConstant(1, DL, MVT::i32)),
 792       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 793           DAG.getConstant(1, DL, MVT::i32)),
 794       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 795           DAG.getConstant(2, DL, MVT::i32)),
 796       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 797           DAG.getConstant(2, DL, MVT::i32)),
 798       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 799           DAG.getConstant(3, DL, MVT::i32)),
 800       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 801           DAG.getConstant(3, DL, MVT::i32))
 802       };
 803       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 804     }
 805
 806     case Intrinsic::r600_read_ngroups_x:
 807       return LowerImplicitParameter(DAG, VT, DL, 0);
 808     case Intrinsic::r600_read_ngroups_y:
 809       return LowerImplicitParameter(DAG, VT, DL, 1);
 810     case Intrinsic::r600_read_ngroups_z:
 811       return LowerImplicitParameter(DAG, VT, DL, 2);
 812     case Intrinsic::r600_read_global_size_x:
 813       return LowerImplicitParameter(DAG, VT, DL, 3);
 814     case Intrinsic::r600_read_global_size_y:
 815       return LowerImplicitParameter(DAG, VT, DL, 4);
 816     case Intrinsic::r600_read_global_size_z:
 817       return LowerImplicitParameter(DAG, VT, DL, 5);
 818     case Intrinsic::r600_read_local_size_x:
 819       return LowerImplicitParameter(DAG, VT, DL, 6);
 820     case Intrinsic::r600_read_local_size_y:
 821       return LowerImplicitParameter(DAG, VT, DL, 7);
 822     case Intrinsic::r600_read_local_size_z:
 823       return LowerImplicitParameter(DAG, VT, DL, 8);
 824
 825     case Intrinsic::AMDGPU_read_workdim:
 826       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 827
 828     case Intrinsic::r600_read_tgid_x:
 829       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 830                                   AMDGPU::T1_X, VT);
 831     case Intrinsic::r600_read_tgid_y:
 832       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 833                                   AMDGPU::T1_Y, VT);
 834     case Intrinsic::r600_read_tgid_z:
 835       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 836                                   AMDGPU::T1_Z, VT);
 837     case Intrinsic::r600_read_tidig_x:
 838       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 839                                   AMDGPU::T0_X, VT);
 840     case Intrinsic::r600_read_tidig_y:
 841       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 842                                   AMDGPU::T0_Y, VT);
 843     case Intrinsic::r600_read_tidig_z:
 844       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 845                                   AMDGPU::T0_Z, VT);
 846     case Intrinsic::AMDGPU_rsq:
 847       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 848       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 849
 850     case AMDGPUIntrinsic::AMDGPU_fract:
 851     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
 852       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 853     }
 854     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 855     break;
 856   }
 857   } // end switch(Op.getOpcode())
 858   return SDValue();
 859 }
 860
/// Build replacement values for the results of node \p N and append them to
/// \p Results.
///
/// Opcodes that are not listed in the switch below are forwarded to
/// AMDGPUTargetLowering::ReplaceNodeResults.
///
/// \param N        Node whose results are being replaced.
/// \param Results  Output list; replacement values are appended to it.
/// \param DAG      DAG used to create the replacement nodes.
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    // An i1 result is produced by LowerFPTOUINT (a "!= 0.0" comparison of the
    // operand).
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    // Nothing is appended when expandFP_TO_SINT reports failure.
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  // UDIV/UREM/SDIV/SREM are each rewritten as the matching two-result
  // [US]DIVREM node on the same operands: the division cases take result 0
  // of that node, the remainder cases take result 1.
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    // Note that Op refers to result number 1 of N here; both results of the
    // lowered node are appended, result 0 first.
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    // LowerUDIVREM64 receives Results directly; presumably it appends the
    // replacement values itself -- confirm against its definition.
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}
 932
 933 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 934                                                    SDValue Vector) const {
 935
 936   SDLoc DL(Vector);
 937   EVT VecVT = Vector.getValueType();
 938   EVT EltVT = VecVT.getVectorElementType();
 939   SmallVector<SDValue, 8> Args;
 940
 941   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 942                                                            i != e; ++i) {
 943     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
 944                                DAG.getConstant(i, DL, getVectorIdxTy())));
 945   }
 946
 947   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 948 }
 949
 950 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 951                                                     SelectionDAG &DAG) const {
 952
 953   SDLoc DL(Op);
 954   SDValue Vector = Op.getOperand(0);
 955   SDValue Index = Op.getOperand(1);
 956
 957   if (isa<ConstantSDNode>(Index) ||
 958       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 959     return Op;
 960
 961   Vector = vectorToVerticalVector(DAG, Vector);
 962   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 963                      Vector, Index);
 964 }
 965
 966 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 967                                                    SelectionDAG &DAG) const {
 968   SDLoc DL(Op);
 969   SDValue Vector = Op.getOperand(0);
 970   SDValue Value = Op.getOperand(1);
 971   SDValue Index = Op.getOperand(2);
 972
 973   if (isa<ConstantSDNode>(Index) ||
 974       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 975     return Op;
 976
 977   Vector = vectorToVerticalVector(DAG, Vector);
 978   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 979                                Vector, Value, Index);
 980   return vectorToVerticalVector(DAG, Insert);
 981 }
 982
 983 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 984   // On hw >= R700, COS/SIN input must be between -1. and 1.
 985   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 986   EVT VT = Op.getValueType();
 987   SDValue Arg = Op.getOperand(0);
 988   SDLoc DL(Op);
 989   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
 990       DAG.getNode(ISD::FADD, DL, VT,
 991         DAG.getNode(ISD::FMUL, DL, VT, Arg,
 992           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
 993         DAG.getConstantFP(0.5, DL, MVT::f32)));
 994   unsigned TrigNode;
 995   switch (Op.getOpcode()) {
 996   case ISD::FCOS:
 997     TrigNode = AMDGPUISD::COS_HW;
 998     break;
 999   case ISD::FSIN:
1000     TrigNode = AMDGPUISD::SIN_HW;
1001     break;
1002   default:
1003     llvm_unreachable("Wrong trig opcode");
1004   }
1005   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
1006       DAG.getNode(ISD::FADD, DL, VT, FractPart,
1007         DAG.getConstantFP(-0.5, DL, MVT::f32)));
1008   if (Gen >= AMDGPUSubtarget::R700)
1009     return TrigVal;
1010   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1011   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
1012       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
1013 }
1014
1015 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1016   SDLoc DL(Op);
1017   EVT VT = Op.getValueType();
1018
1019   SDValue Lo = Op.getOperand(0);
1020   SDValue Hi = Op.getOperand(1);
1021   SDValue Shift = Op.getOperand(2);
1022   SDValue Zero = DAG.getConstant(0, DL, VT);
1023   SDValue One  = DAG.getConstant(1, DL, VT);
1024
1025   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1026   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1027   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1028   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1029
1030   // The dance around Width1 is necessary for 0 special case.
1031   // Without it the CompShift might be 32, producing incorrect results in
1032   // Overflow. So we do the shift in two steps, the alternative is to
1033   // add a conditional to filter the special case.
1034
1035   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1036   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1037
1038   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1039   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1040   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1041
1042   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1043   SDValue LoBig = Zero;
1044
1045   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1046   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1047
1048   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1049 }
1050
1051 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1052   SDLoc DL(Op);
1053   EVT VT = Op.getValueType();
1054
1055   SDValue Lo = Op.getOperand(0);
1056   SDValue Hi = Op.getOperand(1);
1057   SDValue Shift = Op.getOperand(2);
1058   SDValue Zero = DAG.getConstant(0, DL, VT);
1059   SDValue One  = DAG.getConstant(1, DL, VT);
1060
1061   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1062
1063   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1064   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1065   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1066   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1067
1068   // The dance around Width1 is necessary for 0 special case.
1069   // Without it the CompShift might be 32, producing incorrect results in
1070   // Overflow. So we do the shift in two steps, the alternative is to
1071   // add a conditional to filter the special case.
1072
1073   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1074   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1075
1076   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1077   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1078   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1079
1080   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1081   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1082
1083   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1084   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1085
1086   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1087 }
1088
1089 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1090                                           unsigned mainop, unsigned ovf) const {
1091   SDLoc DL(Op);
1092   EVT VT = Op.getValueType();
1093
1094   SDValue Lo = Op.getOperand(0);
1095   SDValue Hi = Op.getOperand(1);
1096
1097   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1098   // Extend sign.
1099   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1100                     DAG.getValueType(MVT::i1));
1101
1102   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1103
1104   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1105 }
1106
1107 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1108   SDLoc DL(Op);
1109   return DAG.getNode(
1110       ISD::SETCC,
1111       DL,
1112       MVT::i1,
1113       Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1114       DAG.getCondCode(ISD::SETNE)
1115       );
1116 }
1117
1118 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1119                                                    SDLoc DL,
1120                                                    unsigned DwordOffset) const {
1121   unsigned ByteOffset = DwordOffset * 4;
1122   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1123                                       AMDGPUAS::CONSTANT_BUFFER_0);
1124
1125   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1126   assert(isInt<16>(ByteOffset));
1127
1128   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1129                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1130                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1131                      false, false, false, 0);
1132 }
1133
1134 bool R600TargetLowering::isZero(SDValue Op) const {
1135   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1136     return Cst->isNullValue();
1137   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1138     return CstFP->isZero();
1139   } else {
1140     return false;
1141   }
1142 }
1143
/// Lower SELECT_CC.  Operands are (LHS, RHS, True, False, CC).
///
/// The lowering is attempted in this order:
///   1. For an f32 result, try CombineFMinMaxLegacy.
///   2. Try to reach a form a SET* instruction can match (True/False are the
///      hardware true/false values).
///   3. Try to reach a form a CND* instruction can match (comparison against
///      zero).
///   4. Otherwise emit two chained SELECT_CC nodes: one producing a hardware
///      boolean, one selecting True/False on that boolean.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  // If they are in the opposite positions, swap them and use the inverse
  // condition; if that condition is not legal, additionally swap LHS/RHS and
  // use the inverse condition with swapped operands.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    // CC may have been replaced above, so re-read the condition code.
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    // "Not equal" style conditions are replaced by their inverse, with the
    // True and False values exchanged to compensate.
    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    // The select is built in CompareVT and bitcast back to the result type.
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  // Hardware booleans: 1.0f / 0.0f for f32 compares, -1 / 0 for i32 compares.
  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  }
  else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  // Select True when the hardware boolean differs from HWFalse.
  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
1285
1286 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1287 /// convert these pointers to a register index.  Each register holds
1288 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1289 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1290 /// for indirect addressing.
1291 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1292                                                unsigned StackWidth,
1293                                                SelectionDAG &DAG) const {
1294   unsigned SRLPad;
1295   switch(StackWidth) {
1296   case 1:
1297     SRLPad = 2;
1298     break;
1299   case 2:
1300     SRLPad = 3;
1301     break;
1302   case 4:
1303     SRLPad = 4;
1304     break;
1305   default: llvm_unreachable("Invalid stack width");
1306   }
1307
1308   SDLoc DL(Ptr);
1309   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1310                      DAG.getConstant(SRLPad, DL, MVT::i32));
1311 }
1312
1313 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1314                                          unsigned ElemIdx,
1315                                          unsigned &Channel,
1316                                          unsigned &PtrIncr) const {
1317   switch (StackWidth) {
1318   default:
1319   case 1:
1320     Channel = 0;
1321     if (ElemIdx > 0) {
1322       PtrIncr = 1;
1323     } else {
1324       PtrIncr = 0;
1325     }
1326     break;
1327   case 2:
1328     Channel = ElemIdx % 2;
1329     if (ElemIdx == 2) {
1330       PtrIncr = 1;
1331     } else {
1332       PtrIncr = 0;
1333     }
1334     break;
1335   case 4:
1336     Channel = ElemIdx;
1337     PtrIncr = 0;
1338     break;
1339   }
1340 }
1341
/// Lower a STORE node.  Operands are (Chain, Value, Ptr).
///
/// The steps, in order:
///   1. Defer to AMDGPUTargetLowering::LowerSTORE and use its result if it
///      produced one.
///   2. Global address space: truncating stores become a STORE_MSKOR memory
///      intrinsic; other stores of values at least 32 bits wide get their
///      pointer converted to a dword address.
///   3. Any remaining store that is not to the private address space returns
///      an empty SDValue.
///   4. Private address space: the store is turned into REGISTER_STORE
///      node(s) using indirect addressing.
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      // Mask covering the stored bits: 8 bits for i8, 16 bits for i16.
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }
      // Split the byte address into a dword address (Ptr >> 2) and the byte
      // index within that dword (Ptr & 3).
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, DL, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, DL, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      // ByteIndex << 3 is the bit offset of the addressed byte; both the
      // truncated value and the mask are moved to that position.
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                   DAG.getConstant(3, DL, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  // Only private-address stores are handled below.
  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // NOTE(review): this repeats the AMDGPUTargetLowering::LowerSTORE call made
  // at the top of the function with the same arguments -- confirm whether it
  // is still needed.
  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  // Turn the byte address into a register index.
  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    // One REGISTER_STORE per element.  Ptr is updated in place, so the
    // increments returned by getStackAddress accumulate across iterations.
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, DL, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, DL, MVT::i32));
    }
    // Join the per-element stores into a single chain.
     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
   } else {
    // i8 values are zero-extended to i32 before being stored.
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
    DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
  }

  return Chain;
}
1454
1455 // return (512 + (kc_bank << 12)
1456 static int
1457 ConstantAddressBlock(unsigned AddressSpace) {
1458   switch (AddressSpace) {
1459   case AMDGPUAS::CONSTANT_BUFFER_0:
1460     return 512;
1461   case AMDGPUAS::CONSTANT_BUFFER_1:
1462     return 512 + 4096;
1463   case AMDGPUAS::CONSTANT_BUFFER_2:
1464     return 512 + 4096 * 2;
1465   case AMDGPUAS::CONSTANT_BUFFER_3:
1466     return 512 + 4096 * 3;
1467   case AMDGPUAS::CONSTANT_BUFFER_4:
1468     return 512 + 4096 * 4;
1469   case AMDGPUAS::CONSTANT_BUFFER_5:
1470     return 512 + 4096 * 5;
1471   case AMDGPUAS::CONSTANT_BUFFER_6:
1472     return 512 + 4096 * 6;
1473   case AMDGPUAS::CONSTANT_BUFFER_7:
1474     return 512 + 4096 * 7;
1475   case AMDGPUAS::CONSTANT_BUFFER_8:
1476     return 512 + 4096 * 8;
1477   case AMDGPUAS::CONSTANT_BUFFER_9:
1478     return 512 + 4096 * 9;
1479   case AMDGPUAS::CONSTANT_BUFFER_10:
1480     return 512 + 4096 * 10;
1481   case AMDGPUAS::CONSTANT_BUFFER_11:
1482     return 512 + 4096 * 11;
1483   case AMDGPUAS::CONSTANT_BUFFER_12:
1484     return 512 + 4096 * 12;
1485   case AMDGPUAS::CONSTANT_BUFFER_13:
1486     return 512 + 4096 * 13;
1487   case AMDGPUAS::CONSTANT_BUFFER_14:
1488     return 512 + 4096 * 14;
1489   case AMDGPUAS::CONSTANT_BUFFER_15:
1490     return 512 + 4096 * 15;
1491   default:
1492     return -1;
1493   }
1494 }
1495
1496 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1497 {
1498   EVT VT = Op.getValueType();
1499   SDLoc DL(Op);
1500   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1501   SDValue Chain = Op.getOperand(0);
1502   SDValue Ptr = Op.getOperand(1);
1503   SDValue LoweredLoad;
1504
1505   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1506   if (Ret.getNode()) {
1507     SDValue Ops[2] = {
1508       Ret,
1509       Chain
1510     };
1511     return DAG.getMergeValues(Ops, DL);
1512   }
1513
1514   // Lower loads constant address space global variable loads
1515   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1516       isa<GlobalVariable>(GetUnderlyingObject(
1517           LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
1518
1519     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1520         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1521     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1522         DAG.getConstant(2, DL, MVT::i32));
1523     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1524                        LoadNode->getChain(), Ptr,
1525                        DAG.getTargetConstant(0, DL, MVT::i32),
1526                        Op.getOperand(2));
1527   }
1528
1529   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1530     SDValue MergedValues[2] = {
1531       ScalarizeVectorLoad(Op, DAG),
1532       Chain
1533     };
1534     return DAG.getMergeValues(MergedValues, DL);
1535   }
1536
1537   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1538   if (ConstantBlock > -1 &&
1539       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1540        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1541     SDValue Result;
1542     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1543         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1544         isa<ConstantSDNode>(Ptr)) {
1545       SDValue Slots[4];
1546       for (unsigned i = 0; i < 4; i++) {
1547         // We want Const position encoded with the following formula :
1548         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1549         // const_index is Ptr computed by llvm using an alignment of 16.
1550         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1551         // then div by 4 at the ISel step
1552         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1553             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1554         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1555       }
1556       EVT NewVT = MVT::v4i32;
1557       unsigned NumElements = 4;
1558       if (VT.isVector()) {
1559         NewVT = VT;
1560         NumElements = VT.getVectorNumElements();
1561       }
1562       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1563                            makeArrayRef(Slots, NumElements));
1564     } else {
1565       // non-constant ptr can't be folded, keeps it as a v4f32 load
1566       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1567           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1568                       DAG.getConstant(4, DL, MVT::i32)),
1569                       DAG.getConstant(LoadNode->getAddressSpace() -
1570                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1571           );
1572     }
1573
1574     if (!VT.isVector()) {
1575       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1576                            DAG.getConstant(0, DL, MVT::i32));
1577     }
1578
1579     SDValue MergedValues[2] = {
1580       Result,
1581       Chain
1582     };
1583     return DAG.getMergeValues(MergedValues, DL);
1584   }
1585
1586   // For most operations returning SDValue() will result in the node being
1587   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1588   // need to manually expand loads that may be legal in some address spaces and
1589   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1590   // compute shaders, since the data is sign extended when it is uploaded to the
1591   // buffer. However SEXT loads from other address spaces are not supported, so
1592   // we need to expand them here.
1593   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1594     EVT MemVT = LoadNode->getMemoryVT();
1595     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1596     SDValue ShiftAmount =
1597           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), DL,
1598                           MVT::i32);
1599     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1600                                   LoadNode->getPointerInfo(), MemVT,
1601                                   LoadNode->isVolatile(),
1602                                   LoadNode->isNonTemporal(),
1603                                   LoadNode->isInvariant(),
1604                                   LoadNode->getAlignment());
1605     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1606     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1607
1608     SDValue MergedValues[2] = { Sra, Chain };
1609     return DAG.getMergeValues(MergedValues, DL);
1610   }
1611
1612   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1613     return SDValue();
1614   }
1615
1616   // Lowering for indirect addressing
1617   const MachineFunction &MF = DAG.getMachineFunction();
1618   const AMDGPUFrameLowering *TFL =
1619       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1620   unsigned StackWidth = TFL->getStackWidth(MF);
1621
1622   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1623
1624   if (VT.isVector()) {
1625     unsigned NumElemVT = VT.getVectorNumElements();
1626     EVT ElemVT = VT.getVectorElementType();
1627     SDValue Loads[4];
1628
1629     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1630                                       "vector width in load");
1631
1632     for (unsigned i = 0; i < NumElemVT; ++i) {
1633       unsigned Channel, PtrIncr;
1634       getStackAddress(StackWidth, i, Channel, PtrIncr);
1635       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1636                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1637       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1638                              Chain, Ptr,
1639                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1640                              Op.getOperand(2));
1641     }
1642     for (unsigned i = NumElemVT; i < 4; ++i) {
1643       Loads[i] = DAG.getUNDEF(ElemVT);
1644     }
1645     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1646     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1647   } else {
1648     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1649                               Chain, Ptr,
1650                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1651                               Op.getOperand(2));
1652   }
1653
1654   SDValue Ops[2] = {
1655     LoweredLoad,
1656     Chain
1657   };
1658
1659   return DAG.getMergeValues(Ops, DL);
1660 }
1661
1662 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1663   SDValue Chain = Op.getOperand(0);
1664   SDValue Cond  = Op.getOperand(1);
1665   SDValue Jump  = Op.getOperand(2);
1666
1667   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1668                      Chain, Jump, Cond);
1669 }
1670
1671 /// XXX Only kernel functions are supported, so we can assume for now that
1672 /// every function is a kernel function, but in the future we should use
1673 /// separate calling conventions for kernel and non-kernel functions.
1674 SDValue R600TargetLowering::LowerFormalArguments(
1675                                       SDValue Chain,
1676                                       CallingConv::ID CallConv,
1677                                       bool isVarArg,
1678                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1679                                       SDLoc DL, SelectionDAG &DAG,
1680                                       SmallVectorImpl<SDValue> &InVals) const {
1681   SmallVector<CCValAssign, 16> ArgLocs;
1682   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1683                  *DAG.getContext());
1684   MachineFunction &MF = DAG.getMachineFunction();
1685   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1686
1687   SmallVector<ISD::InputArg, 8> LocalIns;
1688
1689   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1690
1691   AnalyzeFormalArguments(CCInfo, LocalIns);
1692
1693   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1694     CCValAssign &VA = ArgLocs[i];
1695     const ISD::InputArg &In = Ins[i];
1696     EVT VT = In.VT;
1697     EVT MemVT = VA.getLocVT();
1698     if (!VT.isVector() && MemVT.isVector()) {
1699       // Get load source type if scalarized.
1700       MemVT = MemVT.getVectorElementType();
1701     }
1702
1703     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1704       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1705       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1706       InVals.push_back(Register);
1707       continue;
1708     }
1709
1710     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1711                                           AMDGPUAS::CONSTANT_BUFFER_0);
1712
1713     // i64 isn't a legal type, so the register type used ends up as i32, which
1714     // isn't expected here. It attempts to create this sextload, but it ends up
1715     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1716     // for <1 x i64>.
1717
1718     // The first 36 bytes of the input buffer contains information about
1719     // thread group and global sizes.
1720     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1721     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1722       // FIXME: This should really check the extload type, but the handling of
1723       // extload vector parameters seems to be broken.
1724
1725       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1726       Ext = ISD::SEXTLOAD;
1727     }
1728
1729     // Compute the offset from the value.
1730     // XXX - I think PartOffset should give you this, but it seems to give the
1731     // size of the register which isn't useful.
1732
1733     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1734     unsigned PartOffset = VA.getLocMemOffset();
1735     unsigned Offset = 36 + VA.getLocMemOffset();
1736
1737     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1738     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1739                               DAG.getConstant(Offset, DL, MVT::i32),
1740                               DAG.getUNDEF(MVT::i32),
1741                               PtrInfo,
1742                               MemVT, false, true, true, 4);
1743
1744     // 4 is the preferred alignment for the CONSTANT memory space.
1745     InVals.push_back(Arg);
1746     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1747   }
1748   return Chain;
1749 }
1750
1751 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1752    if (!VT.isVector())
1753      return MVT::i32;
1754    return VT.changeVectorElementTypeToInteger();
1755 }
1756
1757 static SDValue CompactSwizzlableVector(
1758   SelectionDAG &DAG, SDValue VectorEntry,
1759   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1760   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1761   assert(RemapSwizzle.empty());
1762   SDValue NewBldVec[4] = {
1763     VectorEntry.getOperand(0),
1764     VectorEntry.getOperand(1),
1765     VectorEntry.getOperand(2),
1766     VectorEntry.getOperand(3)
1767   };
1768
1769   for (unsigned i = 0; i < 4; i++) {
1770     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1771       // We mask write here to teach later passes that the ith element of this
1772       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1773       // break false dependencies and additionnaly make assembly easier to read.
1774       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1775     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1776       if (C->isZero()) {
1777         RemapSwizzle[i] = 4; // SEL_0
1778         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1779       } else if (C->isExactlyValue(1.0)) {
1780         RemapSwizzle[i] = 5; // SEL_1
1781         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1782       }
1783     }
1784
1785     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1786       continue;
1787     for (unsigned j = 0; j < i; j++) {
1788       if (NewBldVec[i] == NewBldVec[j]) {
1789         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1790         RemapSwizzle[i] = j;
1791         break;
1792       }
1793     }
1794   }
1795
1796   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1797                      VectorEntry.getValueType(), NewBldVec);
1798 }
1799
1800 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1801                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1802   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1803   assert(RemapSwizzle.empty());
1804   SDValue NewBldVec[4] = {
1805       VectorEntry.getOperand(0),
1806       VectorEntry.getOperand(1),
1807       VectorEntry.getOperand(2),
1808       VectorEntry.getOperand(3)
1809   };
1810   bool isUnmovable[4] = { false, false, false, false };
1811   for (unsigned i = 0; i < 4; i++) {
1812     RemapSwizzle[i] = i;
1813     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1814       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1815           ->getZExtValue();
1816       if (i == Idx)
1817         isUnmovable[Idx] = true;
1818     }
1819   }
1820
1821   for (unsigned i = 0; i < 4; i++) {
1822     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1823       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1824           ->getZExtValue();
1825       if (isUnmovable[Idx])
1826         continue;
1827       // Swap i and Idx
1828       std::swap(NewBldVec[Idx], NewBldVec[i]);
1829       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1830       break;
1831     }
1832   }
1833
1834   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1835                      VectorEntry.getValueType(), NewBldVec);
1836 }
1837
1838
1839 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1840                                             SDValue Swz[4], SelectionDAG &DAG,
1841                                             SDLoc DL) const {
1842   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1843   // Old -> New swizzle values
1844   DenseMap<unsigned, unsigned> SwizzleRemap;
1845
1846   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1847   for (unsigned i = 0; i < 4; i++) {
1848     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1849     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1850       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1851   }
1852
1853   SwizzleRemap.clear();
1854   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1855   for (unsigned i = 0; i < 4; i++) {
1856     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1857     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1858       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1859   }
1860
1861   return BuildVector;
1862 }
1863
1864
1865 //===----------------------------------------------------------------------===//
1866 // Custom DAG Optimizations
1867 //===----------------------------------------------------------------------===//
1868
1869 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1870                                               DAGCombinerInfo &DCI) const {
1871   SelectionDAG &DAG = DCI.DAG;
1872
1873   switch (N->getOpcode()) {
1874   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1875   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1876   case ISD::FP_ROUND: {
1877       SDValue Arg = N->getOperand(0);
1878       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1879         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1880                            Arg.getOperand(0));
1881       }
1882       break;
1883     }
1884
1885   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1886   // (i32 select_cc f32, f32, -1, 0 cc)
1887   //
1888   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1889   // this to one of the SET*_DX10 instructions.
1890   case ISD::FP_TO_SINT: {
1891     SDValue FNeg = N->getOperand(0);
1892     if (FNeg.getOpcode() != ISD::FNEG) {
1893       return SDValue();
1894     }
1895     SDValue SelectCC = FNeg.getOperand(0);
1896     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1897         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1898         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1899         !isHWTrueValue(SelectCC.getOperand(2)) ||
1900         !isHWFalseValue(SelectCC.getOperand(3))) {
1901       return SDValue();
1902     }
1903
1904     SDLoc dl(N);
1905     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1906                            SelectCC.getOperand(0), // LHS
1907                            SelectCC.getOperand(1), // RHS
1908                            DAG.getConstant(-1, dl, MVT::i32), // True
1909                            DAG.getConstant(0, dl, MVT::i32),  // False
1910                            SelectCC.getOperand(4)); // CC
1911
1912     break;
1913   }
1914
1915   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1916   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1917   case ISD::INSERT_VECTOR_ELT: {
1918     SDValue InVec = N->getOperand(0);
1919     SDValue InVal = N->getOperand(1);
1920     SDValue EltNo = N->getOperand(2);
1921     SDLoc dl(N);
1922
1923     // If the inserted element is an UNDEF, just use the input vector.
1924     if (InVal.getOpcode() == ISD::UNDEF)
1925       return InVec;
1926
1927     EVT VT = InVec.getValueType();
1928
1929     // If we can't generate a legal BUILD_VECTOR, exit
1930     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1931       return SDValue();
1932
1933     // Check that we know which element is being inserted
1934     if (!isa<ConstantSDNode>(EltNo))
1935       return SDValue();
1936     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1937
1938     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1939     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1940     // vector elements.
1941     SmallVector<SDValue, 8> Ops;
1942     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1943       Ops.append(InVec.getNode()->op_begin(),
1944                  InVec.getNode()->op_end());
1945     } else if (InVec.getOpcode() == ISD::UNDEF) {
1946       unsigned NElts = VT.getVectorNumElements();
1947       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1948     } else {
1949       return SDValue();
1950     }
1951
1952     // Insert the element
1953     if (Elt < Ops.size()) {
1954       // All the operands of BUILD_VECTOR must have the same type;
1955       // we enforce that here.
1956       EVT OpVT = Ops[0].getValueType();
1957       if (InVal.getValueType() != OpVT)
1958         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1959           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1960           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1961       Ops[Elt] = InVal;
1962     }
1963
1964     // Return the new vector
1965     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1966   }
1967
1968   // Extract_vec (Build_vector) generated by custom lowering
1969   // also needs to be customly combined
1970   case ISD::EXTRACT_VECTOR_ELT: {
1971     SDValue Arg = N->getOperand(0);
1972     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1973       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1974         unsigned Element = Const->getZExtValue();
1975         return Arg->getOperand(Element);
1976       }
1977     }
1978     if (Arg.getOpcode() == ISD::BITCAST &&
1979         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1980       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1981         unsigned Element = Const->getZExtValue();
1982         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1983             Arg->getOperand(0).getOperand(Element));
1984       }
1985     }
1986   }
1987
1988   case ISD::SELECT_CC: {
1989     // Try common optimizations
1990     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1991     if (Ret.getNode())
1992       return Ret;
1993
1994     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1995     //      selectcc x, y, a, b, inv(cc)
1996     //
1997     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1998     //      selectcc x, y, a, b, cc
1999     SDValue LHS = N->getOperand(0);
2000     if (LHS.getOpcode() != ISD::SELECT_CC) {
2001       return SDValue();
2002     }
2003
2004     SDValue RHS = N->getOperand(1);
2005     SDValue True = N->getOperand(2);
2006     SDValue False = N->getOperand(3);
2007     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2008
2009     if (LHS.getOperand(2).getNode() != True.getNode() ||
2010         LHS.getOperand(3).getNode() != False.getNode() ||
2011         RHS.getNode() != False.getNode()) {
2012       return SDValue();
2013     }
2014
2015     switch (NCC) {
2016     default: return SDValue();
2017     case ISD::SETNE: return LHS;
2018     case ISD::SETEQ: {
2019       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2020       LHSCC = ISD::getSetCCInverse(LHSCC,
2021                                   LHS.getOperand(0).getValueType().isInteger());
2022       if (DCI.isBeforeLegalizeOps() ||
2023           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2024         return DAG.getSelectCC(SDLoc(N),
2025                                LHS.getOperand(0),
2026                                LHS.getOperand(1),
2027                                LHS.getOperand(2),
2028                                LHS.getOperand(3),
2029                                LHSCC);
2030       break;
2031     }
2032     }
2033     return SDValue();
2034   }
2035
2036   case AMDGPUISD::EXPORT: {
2037     SDValue Arg = N->getOperand(1);
2038     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2039       break;
2040
2041     SDValue NewArgs[8] = {
2042       N->getOperand(0), // Chain
2043       SDValue(),
2044       N->getOperand(2), // ArrayBase
2045       N->getOperand(3), // Type
2046       N->getOperand(4), // SWZ_X
2047       N->getOperand(5), // SWZ_Y
2048       N->getOperand(6), // SWZ_Z
2049       N->getOperand(7) // SWZ_W
2050     };
2051     SDLoc DL(N);
2052     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2053     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2054   }
2055   case AMDGPUISD::TEXTURE_FETCH: {
2056     SDValue Arg = N->getOperand(1);
2057     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2058       break;
2059
2060     SDValue NewArgs[19] = {
2061       N->getOperand(0),
2062       N->getOperand(1),
2063       N->getOperand(2),
2064       N->getOperand(3),
2065       N->getOperand(4),
2066       N->getOperand(5),
2067       N->getOperand(6),
2068       N->getOperand(7),
2069       N->getOperand(8),
2070       N->getOperand(9),
2071       N->getOperand(10),
2072       N->getOperand(11),
2073       N->getOperand(12),
2074       N->getOperand(13),
2075       N->getOperand(14),
2076       N->getOperand(15),
2077       N->getOperand(16),
2078       N->getOperand(17),
2079       N->getOperand(18),
2080     };
2081     SDLoc DL(N);
2082     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2083     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2084   }
2085   }
2086
2087   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2088 }
2089
2090 static bool
2091 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2092             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2093   const R600InstrInfo *TII =
2094       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2095   if (!Src.isMachineOpcode())
2096     return false;
2097   switch (Src.getMachineOpcode()) {
2098   case AMDGPU::FNEG_R600:
2099     if (!Neg.getNode())
2100       return false;
2101     Src = Src.getOperand(0);
2102     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2103     return true;
2104   case AMDGPU::FABS_R600:
2105     if (!Abs.getNode())
2106       return false;
2107     Src = Src.getOperand(0);
2108     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2109     return true;
2110   case AMDGPU::CONST_COPY: {
2111     unsigned Opcode = ParentNode->getMachineOpcode();
2112     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2113
2114     if (!Sel.getNode())
2115       return false;
2116
2117     SDValue CstOffset = Src.getOperand(0);
2118     if (ParentNode->getValueType(0).isVector())
2119       return false;
2120
2121     // Gather constants values
2122     int SrcIndices[] = {
2123       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2124       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2125       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2126       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2127       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2128       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2129       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2130       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2131       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2132       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2133       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2134     };
2135     std::vector<unsigned> Consts;
2136     for (int OtherSrcIdx : SrcIndices) {
2137       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2138       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2139         continue;
2140       if (HasDst) {
2141         OtherSrcIdx--;
2142         OtherSelIdx--;
2143       }
2144       if (RegisterSDNode *Reg =
2145           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2146         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2147           ConstantSDNode *Cst
2148             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2149           Consts.push_back(Cst->getZExtValue());
2150         }
2151       }
2152     }
2153
2154     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2155     Consts.push_back(Cst->getZExtValue());
2156     if (!TII->fitsConstReadLimitations(Consts)) {
2157       return false;
2158     }
2159
2160     Sel = CstOffset;
2161     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2162     return true;
2163   }
2164   case AMDGPU::MOV_IMM_I32:
2165   case AMDGPU::MOV_IMM_F32: {
2166     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2167     uint64_t ImmValue = 0;
2168
2169
2170     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2171       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2172       float FloatValue = FPC->getValueAPF().convertToFloat();
2173       if (FloatValue == 0.0) {
2174         ImmReg = AMDGPU::ZERO;
2175       } else if (FloatValue == 0.5) {
2176         ImmReg = AMDGPU::HALF;
2177       } else if (FloatValue == 1.0) {
2178         ImmReg = AMDGPU::ONE;
2179       } else {
2180         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2181       }
2182     } else {
2183       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2184       uint64_t Value = C->getZExtValue();
2185       if (Value == 0) {
2186         ImmReg = AMDGPU::ZERO;
2187       } else if (Value == 1) {
2188         ImmReg = AMDGPU::ONE_INT;
2189       } else {
2190         ImmValue = Value;
2191       }
2192     }
2193
2194     // Check that we aren't already using an immediate.
2195     // XXX: It's possible for an instruction to have more than one
2196     // immediate operand, but this is not supported yet.
2197     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2198       if (!Imm.getNode())
2199         return false;
2200       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2201       assert(C);
2202       if (C->getZExtValue())
2203         return false;
2204       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2205     }
2206     Src = DAG.getRegister(ImmReg, MVT::i32);
2207     return true;
2208   }
2209   default:
2210     return false;
2211   }
2212 }
2213
2214
2215 /// \brief Fold the instructions after selecting them
2216 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2217                                             SelectionDAG &DAG) const {
2218   const R600InstrInfo *TII =
2219       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2220   if (!Node->isMachineOpcode())
2221     return Node;
2222   unsigned Opcode = Node->getMachineOpcode();
2223   SDValue FakeOp;
2224
2225   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2226
2227   if (Opcode == AMDGPU::DOT_4) {
2228     int OperandIdx[] = {
2229       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2230       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2231       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2232       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2233       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2234       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2235       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2236       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2237         };
2238     int NegIdx[] = {
2239       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2240       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2241       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2242       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2243       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2244       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2245       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2246       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2247     };
2248     int AbsIdx[] = {
2249       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2250       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2251       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2252       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2253       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2254       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2255       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2256       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2257     };
2258     for (unsigned i = 0; i < 8; i++) {
2259       if (OperandIdx[i] < 0)
2260         return Node;
2261       SDValue &Src = Ops[OperandIdx[i] - 1];
2262       SDValue &Neg = Ops[NegIdx[i] - 1];
2263       SDValue &Abs = Ops[AbsIdx[i] - 1];
2264       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2265       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2266       if (HasDst)
2267         SelIdx--;
2268       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2269       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2270         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2271     }
2272   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2273     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2274       SDValue &Src = Ops[i];
2275       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2276         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2277     }
2278   } else if (Opcode == AMDGPU::CLAMP_R600) {
2279     SDValue Src = Node->getOperand(0);
2280     if (!Src.isMachineOpcode() ||
2281         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2282       return Node;
2283     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2284         AMDGPU::OpName::clamp);
2285     if (ClampIdx < 0)
2286       return Node;
2287     SDLoc DL(Node);
2288     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2289     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2290     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2291                               Node->getVTList(), Ops);
2292   } else {
2293     if (!TII->hasInstrModifiers(Opcode))
2294       return Node;
2295     int OperandIdx[] = {
2296       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2297       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2298       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2299     };
2300     int NegIdx[] = {
2301       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2302       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2303       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2304     };
2305     int AbsIdx[] = {
2306       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2307       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2308       -1
2309     };
2310     for (unsigned i = 0; i < 3; i++) {
2311       if (OperandIdx[i] < 0)
2312         return Node;
2313       SDValue &Src = Ops[OperandIdx[i] - 1];
2314       SDValue &Neg = Ops[NegIdx[i] - 1];
2315       SDValue FakeAbs;
2316       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2317       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2318       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2319       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2320       if (HasDst) {
2321         SelIdx--;
2322         ImmIdx--;
2323       }
2324       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2325       SDValue &Imm = Ops[ImmIdx];
2326       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2327         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2328     }
2329   }
2330
2331   return Node;
2332 }