lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // ADD, SUB overflow. These need to be Custom because
  95   // SelectionDAGLegalize::LegalizeOp (LegalizeDAG.cpp)
  96   // turns Legal into expand
  97   if (Subtarget->hasCARRY())
  98     setOperationAction(ISD::UADDO, MVT::i32, Custom);
  99
 100   if (Subtarget->hasBORROW())
 101     setOperationAction(ISD::USUBO, MVT::i32, Custom);
 102
 103   // Expand sign extension of vectors
 104   if (!Subtarget->hasBFE())
 105     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 106
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 109
 110   if (!Subtarget->hasBFE())
 111     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 114
 115   if (!Subtarget->hasBFE())
 116     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 117   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 118   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 119
 120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 122   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 123
 124   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 125
 126
 127   // Legalize loads and stores to the private address space.
 128   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 129   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 130   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 131
 132   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 133   // spaces, so it is custom lowered to handle those where it isn't.
 134   for (MVT VT : MVT::integer_valuetypes()) {
 135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 136     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 137     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 138
 139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 141     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 142
 143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 145     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 146   }
 147
 148   setOperationAction(ISD::STORE, MVT::i8, Custom);
 149   setOperationAction(ISD::STORE, MVT::i32, Custom);
 150   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 151   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 152   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 153   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 154
 155   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 156   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 157   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 158
 159   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 160   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 161   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 162   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 163
 164   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 165   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 166   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 167   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 168
 169   setTargetDAGCombine(ISD::FP_ROUND);
 170   setTargetDAGCombine(ISD::FP_TO_SINT);
 171   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 172   setTargetDAGCombine(ISD::SELECT_CC);
 173   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 174
 175   // These should be replaced by UDVIREM, but it does not happen automatically
 176   // during Type Legalization
 177   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 178   setOperationAction(ISD::UREM, MVT::i64, Custom);
 179   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 180   setOperationAction(ISD::SREM, MVT::i64, Custom);
 181
 182   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 183   //  to be Legal/Custom in order to avoid library calls.
 184   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 185   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 186   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 187
 188   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 189
 190   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 191   for (MVT VT : ScalarIntVTs) {
 192     setOperationAction(ISD::ADDC, VT, Expand);
 193     setOperationAction(ISD::SUBC, VT, Expand);
 194     setOperationAction(ISD::ADDE, VT, Expand);
 195     setOperationAction(ISD::SUBE, VT, Expand);
 196   }
 197
 198   setSchedulingPreference(Sched::Source);
 199 }
 200
 201 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 202     MachineInstr * MI, MachineBasicBlock * BB) const {
 203   MachineFunction * MF = BB->getParent();
 204   MachineRegisterInfo &MRI = MF->getRegInfo();
 205   MachineBasicBlock::iterator I = *MI;
 206   const R600InstrInfo *TII =
 207       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 208
 209   switch (MI->getOpcode()) {
 210   default:
 211     // Replace LDS_*_RET instruction that don't have any uses with the
 212     // equivalent LDS_*_NORET instruction.
 213     if (TII->isLDSRetInstr(MI->getOpcode())) {
 214       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 215       assert(DstIdx != -1);
 216       MachineInstrBuilder NewMI;
 217       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 218       //        LDS_1A2D support and remove this special case.
 219       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 220            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 221         return BB;
 222
 223       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 224                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 225       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 226         NewMI.addOperand(MI->getOperand(i));
 227       }
 228     } else {
 229       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 230     }
 231     break;
 232   case AMDGPU::CLAMP_R600: {
 233     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 234                                                    AMDGPU::MOV,
 235                                                    MI->getOperand(0).getReg(),
 236                                                    MI->getOperand(1).getReg());
 237     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 238     break;
 239   }
 240
 241   case AMDGPU::FABS_R600: {
 242     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 243                                                     AMDGPU::MOV,
 244                                                     MI->getOperand(0).getReg(),
 245                                                     MI->getOperand(1).getReg());
 246     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 247     break;
 248   }
 249
 250   case AMDGPU::FNEG_R600: {
 251     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 252                                                     AMDGPU::MOV,
 253                                                     MI->getOperand(0).getReg(),
 254                                                     MI->getOperand(1).getReg());
 255     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 256     break;
 257   }
 258
 259   case AMDGPU::MASK_WRITE: {
 260     unsigned maskedRegister = MI->getOperand(0).getReg();
 261     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 262     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 263     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 264     break;
 265   }
 266
 267   case AMDGPU::MOV_IMM_F32:
 268     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 269                      MI->getOperand(1).getFPImm()->getValueAPF()
 270                          .bitcastToAPInt().getZExtValue());
 271     break;
 272   case AMDGPU::MOV_IMM_I32:
 273     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 274                      MI->getOperand(1).getImm());
 275     break;
 276   case AMDGPU::CONST_COPY: {
 277     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 278         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 279     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 280         MI->getOperand(1).getImm());
 281     break;
 282   }
 283
 284   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 285   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 286   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 287     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 288
 289     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 290             .addOperand(MI->getOperand(0))
 291             .addOperand(MI->getOperand(1))
 292             .addImm(EOP); // Set End of program bit
 293     break;
 294   }
 295
 296   case AMDGPU::TXD: {
 297     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 298     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 299     MachineOperand &RID = MI->getOperand(4);
 300     MachineOperand &SID = MI->getOperand(5);
 301     unsigned TextureId = MI->getOperand(6).getImm();
 302     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 303     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 304
 305     switch (TextureId) {
 306     case 5: // Rect
 307       CTX = CTY = 0;
 308       break;
 309     case 6: // Shadow1D
 310       SrcW = SrcZ;
 311       break;
 312     case 7: // Shadow2D
 313       SrcW = SrcZ;
 314       break;
 315     case 8: // ShadowRect
 316       CTX = CTY = 0;
 317       SrcW = SrcZ;
 318       break;
 319     case 9: // 1DArray
 320       SrcZ = SrcY;
 321       CTZ = 0;
 322       break;
 323     case 10: // 2DArray
 324       CTZ = 0;
 325       break;
 326     case 11: // Shadow1DArray
 327       SrcZ = SrcY;
 328       CTZ = 0;
 329       break;
 330     case 12: // Shadow2DArray
 331       CTZ = 0;
 332       break;
 333     }
 334     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 335             .addOperand(MI->getOperand(3))
 336             .addImm(SrcX)
 337             .addImm(SrcY)
 338             .addImm(SrcZ)
 339             .addImm(SrcW)
 340             .addImm(0)
 341             .addImm(0)
 342             .addImm(0)
 343             .addImm(0)
 344             .addImm(1)
 345             .addImm(2)
 346             .addImm(3)
 347             .addOperand(RID)
 348             .addOperand(SID)
 349             .addImm(CTX)
 350             .addImm(CTY)
 351             .addImm(CTZ)
 352             .addImm(CTW);
 353     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 354             .addOperand(MI->getOperand(2))
 355             .addImm(SrcX)
 356             .addImm(SrcY)
 357             .addImm(SrcZ)
 358             .addImm(SrcW)
 359             .addImm(0)
 360             .addImm(0)
 361             .addImm(0)
 362             .addImm(0)
 363             .addImm(1)
 364             .addImm(2)
 365             .addImm(3)
 366             .addOperand(RID)
 367             .addOperand(SID)
 368             .addImm(CTX)
 369             .addImm(CTY)
 370             .addImm(CTZ)
 371             .addImm(CTW);
 372     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 373             .addOperand(MI->getOperand(0))
 374             .addOperand(MI->getOperand(1))
 375             .addImm(SrcX)
 376             .addImm(SrcY)
 377             .addImm(SrcZ)
 378             .addImm(SrcW)
 379             .addImm(0)
 380             .addImm(0)
 381             .addImm(0)
 382             .addImm(0)
 383             .addImm(1)
 384             .addImm(2)
 385             .addImm(3)
 386             .addOperand(RID)
 387             .addOperand(SID)
 388             .addImm(CTX)
 389             .addImm(CTY)
 390             .addImm(CTZ)
 391             .addImm(CTW)
 392             .addReg(T0, RegState::Implicit)
 393             .addReg(T1, RegState::Implicit);
 394     break;
 395   }
 396
 397   case AMDGPU::TXD_SHADOW: {
 398     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 399     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 400     MachineOperand &RID = MI->getOperand(4);
 401     MachineOperand &SID = MI->getOperand(5);
 402     unsigned TextureId = MI->getOperand(6).getImm();
 403     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 404     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 405
 406     switch (TextureId) {
 407     case 5: // Rect
 408       CTX = CTY = 0;
 409       break;
 410     case 6: // Shadow1D
 411       SrcW = SrcZ;
 412       break;
 413     case 7: // Shadow2D
 414       SrcW = SrcZ;
 415       break;
 416     case 8: // ShadowRect
 417       CTX = CTY = 0;
 418       SrcW = SrcZ;
 419       break;
 420     case 9: // 1DArray
 421       SrcZ = SrcY;
 422       CTZ = 0;
 423       break;
 424     case 10: // 2DArray
 425       CTZ = 0;
 426       break;
 427     case 11: // Shadow1DArray
 428       SrcZ = SrcY;
 429       CTZ = 0;
 430       break;
 431     case 12: // Shadow2DArray
 432       CTZ = 0;
 433       break;
 434     }
 435
 436     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 437             .addOperand(MI->getOperand(3))
 438             .addImm(SrcX)
 439             .addImm(SrcY)
 440             .addImm(SrcZ)
 441             .addImm(SrcW)
 442             .addImm(0)
 443             .addImm(0)
 444             .addImm(0)
 445             .addImm(0)
 446             .addImm(1)
 447             .addImm(2)
 448             .addImm(3)
 449             .addOperand(RID)
 450             .addOperand(SID)
 451             .addImm(CTX)
 452             .addImm(CTY)
 453             .addImm(CTZ)
 454             .addImm(CTW);
 455     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 456             .addOperand(MI->getOperand(2))
 457             .addImm(SrcX)
 458             .addImm(SrcY)
 459             .addImm(SrcZ)
 460             .addImm(SrcW)
 461             .addImm(0)
 462             .addImm(0)
 463             .addImm(0)
 464             .addImm(0)
 465             .addImm(1)
 466             .addImm(2)
 467             .addImm(3)
 468             .addOperand(RID)
 469             .addOperand(SID)
 470             .addImm(CTX)
 471             .addImm(CTY)
 472             .addImm(CTZ)
 473             .addImm(CTW);
 474     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 475             .addOperand(MI->getOperand(0))
 476             .addOperand(MI->getOperand(1))
 477             .addImm(SrcX)
 478             .addImm(SrcY)
 479             .addImm(SrcZ)
 480             .addImm(SrcW)
 481             .addImm(0)
 482             .addImm(0)
 483             .addImm(0)
 484             .addImm(0)
 485             .addImm(1)
 486             .addImm(2)
 487             .addImm(3)
 488             .addOperand(RID)
 489             .addOperand(SID)
 490             .addImm(CTX)
 491             .addImm(CTY)
 492             .addImm(CTZ)
 493             .addImm(CTW)
 494             .addReg(T0, RegState::Implicit)
 495             .addReg(T1, RegState::Implicit);
 496     break;
 497   }
 498
 499   case AMDGPU::BRANCH:
 500       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 501               .addOperand(MI->getOperand(0));
 502       break;
 503
 504   case AMDGPU::BRANCH_COND_f32: {
 505     MachineInstr *NewMI =
 506       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 507               AMDGPU::PREDICATE_BIT)
 508               .addOperand(MI->getOperand(1))
 509               .addImm(OPCODE_IS_NOT_ZERO)
 510               .addImm(0); // Flags
 511     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 512     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 513             .addOperand(MI->getOperand(0))
 514             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 515     break;
 516   }
 517
 518   case AMDGPU::BRANCH_COND_i32: {
 519     MachineInstr *NewMI =
 520       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 521             AMDGPU::PREDICATE_BIT)
 522             .addOperand(MI->getOperand(1))
 523             .addImm(OPCODE_IS_NOT_ZERO_INT)
 524             .addImm(0); // Flags
 525     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 526     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 527            .addOperand(MI->getOperand(0))
 528             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 529     break;
 530   }
 531
 532   case AMDGPU::EG_ExportSwz:
 533   case AMDGPU::R600_ExportSwz: {
 534     // Instruction is left unmodified if its not the last one of its type
 535     bool isLastInstructionOfItsType = true;
 536     unsigned InstExportType = MI->getOperand(1).getImm();
 537     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 538          EndBlock = BB->end(); NextExportInst != EndBlock;
 539          NextExportInst = std::next(NextExportInst)) {
 540       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 541           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 542         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 543             .getImm();
 544         if (CurrentInstExportType == InstExportType) {
 545           isLastInstructionOfItsType = false;
 546           break;
 547         }
 548       }
 549     }
 550     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 551     if (!EOP && !isLastInstructionOfItsType)
 552       return BB;
 553     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 554     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 555             .addOperand(MI->getOperand(0))
 556             .addOperand(MI->getOperand(1))
 557             .addOperand(MI->getOperand(2))
 558             .addOperand(MI->getOperand(3))
 559             .addOperand(MI->getOperand(4))
 560             .addOperand(MI->getOperand(5))
 561             .addOperand(MI->getOperand(6))
 562             .addImm(CfInst)
 563             .addImm(EOP);
 564     break;
 565   }
 566   case AMDGPU::RETURN: {
 567     // RETURN instructions must have the live-out registers as implicit uses,
 568     // otherwise they appear dead.
 569     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 570     MachineInstrBuilder MIB(*MF, MI);
 571     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 572       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 573     return BB;
 574   }
 575   }
 576
 577   MI->eraseFromParent();
 578   return BB;
 579 }
 580
 581 //===----------------------------------------------------------------------===//
 582 // Custom DAG Lowering Operations
 583 //===----------------------------------------------------------------------===//
 584
 585 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 586   MachineFunction &MF = DAG.getMachineFunction();
 587   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 588   switch (Op.getOpcode()) {
 589   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 590   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 591   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 592   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 593   case ISD::SRA_PARTS:
 594   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 595   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
 596   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
 597   case ISD::FCOS:
 598   case ISD::FSIN: return LowerTrig(Op, DAG);
 599   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 600   case ISD::STORE: return LowerSTORE(Op, DAG);
 601   case ISD::LOAD: {
 602     SDValue Result = LowerLOAD(Op, DAG);
 603     assert((!Result.getNode() ||
 604             Result.getNode()->getNumValues() == 2) &&
 605            "Load should return a value and a chain");
 606     return Result;
 607   }
 608
 609   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 610   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 611   case ISD::INTRINSIC_VOID: {
 612     SDValue Chain = Op.getOperand(0);
 613     unsigned IntrinsicID =
 614                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 615     switch (IntrinsicID) {
 616     case AMDGPUIntrinsic::AMDGPU_store_output: {
 617       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 618       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 619       MFI->LiveOuts.push_back(Reg);
 620       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 621     }
 622     case AMDGPUIntrinsic::R600_store_swizzle: {
 623       const SDValue Args[8] = {
 624         Chain,
 625         Op.getOperand(2), // Export Value
 626         Op.getOperand(3), // ArrayBase
 627         Op.getOperand(4), // Type
 628         DAG.getConstant(0, MVT::i32), // SWZ_X
 629         DAG.getConstant(1, MVT::i32), // SWZ_Y
 630         DAG.getConstant(2, MVT::i32), // SWZ_Z
 631         DAG.getConstant(3, MVT::i32) // SWZ_W
 632       };
 633       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 634     }
 635
 636     // default for switch(IntrinsicID)
 637     default: break;
 638     }
 639     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 640     break;
 641   }
 642   case ISD::INTRINSIC_WO_CHAIN: {
 643     unsigned IntrinsicID =
 644                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 645     EVT VT = Op.getValueType();
 646     SDLoc DL(Op);
 647     switch(IntrinsicID) {
 648     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 649     case AMDGPUIntrinsic::R600_load_input: {
 650       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 651       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 652       MachineFunction &MF = DAG.getMachineFunction();
 653       MachineRegisterInfo &MRI = MF.getRegInfo();
 654       MRI.addLiveIn(Reg);
 655       return DAG.getCopyFromReg(DAG.getEntryNode(),
 656           SDLoc(DAG.getEntryNode()), Reg, VT);
 657     }
 658
 659     case AMDGPUIntrinsic::R600_interp_input: {
 660       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 661       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 662       MachineSDNode *interp;
 663       if (ijb < 0) {
 664         const R600InstrInfo *TII =
 665             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 666         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 667             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 668         return DAG.getTargetExtractSubreg(
 669             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 670             DL, MVT::f32, SDValue(interp, 0));
 671       }
 672       MachineFunction &MF = DAG.getMachineFunction();
 673       MachineRegisterInfo &MRI = MF.getRegInfo();
 674       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 675       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 676       MRI.addLiveIn(RegisterI);
 677       MRI.addLiveIn(RegisterJ);
 678       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 679           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 680       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 681           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 682
 683       if (slot % 4 < 2)
 684         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 685             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 686             RegisterJNode, RegisterINode);
 687       else
 688         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 689             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 690             RegisterJNode, RegisterINode);
 691       return SDValue(interp, slot % 2);
 692     }
 693     case AMDGPUIntrinsic::R600_interp_xy:
 694     case AMDGPUIntrinsic::R600_interp_zw: {
 695       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 696       MachineSDNode *interp;
 697       SDValue RegisterINode = Op.getOperand(2);
 698       SDValue RegisterJNode = Op.getOperand(3);
 699
 700       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 701         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 702             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 703             RegisterJNode, RegisterINode);
 704       else
 705         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 706             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 707             RegisterJNode, RegisterINode);
 708       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 709           SDValue(interp, 0), SDValue(interp, 1));
 710     }
 711     case AMDGPUIntrinsic::R600_tex:
 712     case AMDGPUIntrinsic::R600_texc:
 713     case AMDGPUIntrinsic::R600_txl:
 714     case AMDGPUIntrinsic::R600_txlc:
 715     case AMDGPUIntrinsic::R600_txb:
 716     case AMDGPUIntrinsic::R600_txbc:
 717     case AMDGPUIntrinsic::R600_txf:
 718     case AMDGPUIntrinsic::R600_txq:
 719     case AMDGPUIntrinsic::R600_ddx:
 720     case AMDGPUIntrinsic::R600_ddy:
 721     case AMDGPUIntrinsic::R600_ldptr: {
 722       unsigned TextureOp;
 723       switch (IntrinsicID) {
 724       case AMDGPUIntrinsic::R600_tex:
 725         TextureOp = 0;
 726         break;
 727       case AMDGPUIntrinsic::R600_texc:
 728         TextureOp = 1;
 729         break;
 730       case AMDGPUIntrinsic::R600_txl:
 731         TextureOp = 2;
 732         break;
 733       case AMDGPUIntrinsic::R600_txlc:
 734         TextureOp = 3;
 735         break;
 736       case AMDGPUIntrinsic::R600_txb:
 737         TextureOp = 4;
 738         break;
 739       case AMDGPUIntrinsic::R600_txbc:
 740         TextureOp = 5;
 741         break;
 742       case AMDGPUIntrinsic::R600_txf:
 743         TextureOp = 6;
 744         break;
 745       case AMDGPUIntrinsic::R600_txq:
 746         TextureOp = 7;
 747         break;
 748       case AMDGPUIntrinsic::R600_ddx:
 749         TextureOp = 8;
 750         break;
 751       case AMDGPUIntrinsic::R600_ddy:
 752         TextureOp = 9;
 753         break;
 754       case AMDGPUIntrinsic::R600_ldptr:
 755         TextureOp = 10;
 756         break;
 757       default:
 758         llvm_unreachable("Unknow Texture Operation");
 759       }
 760
 761       SDValue TexArgs[19] = {
 762         DAG.getConstant(TextureOp, MVT::i32),
 763         Op.getOperand(1),
 764         DAG.getConstant(0, MVT::i32),
 765         DAG.getConstant(1, MVT::i32),
 766         DAG.getConstant(2, MVT::i32),
 767         DAG.getConstant(3, MVT::i32),
 768         Op.getOperand(2),
 769         Op.getOperand(3),
 770         Op.getOperand(4),
 771         DAG.getConstant(0, MVT::i32),
 772         DAG.getConstant(1, MVT::i32),
 773         DAG.getConstant(2, MVT::i32),
 774         DAG.getConstant(3, MVT::i32),
 775         Op.getOperand(5),
 776         Op.getOperand(6),
 777         Op.getOperand(7),
 778         Op.getOperand(8),
 779         Op.getOperand(9),
 780         Op.getOperand(10)
 781       };
 782       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 783     }
 784     case AMDGPUIntrinsic::AMDGPU_dp4: {
 785       SDValue Args[8] = {
 786       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 787           DAG.getConstant(0, MVT::i32)),
 788       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 789           DAG.getConstant(0, MVT::i32)),
 790       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 791           DAG.getConstant(1, MVT::i32)),
 792       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 793           DAG.getConstant(1, MVT::i32)),
 794       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 795           DAG.getConstant(2, MVT::i32)),
 796       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 797           DAG.getConstant(2, MVT::i32)),
 798       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 799           DAG.getConstant(3, MVT::i32)),
 800       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 801           DAG.getConstant(3, MVT::i32))
 802       };
 803       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 804     }
 805
 806     case Intrinsic::r600_read_ngroups_x:
 807       return LowerImplicitParameter(DAG, VT, DL, 0);
 808     case Intrinsic::r600_read_ngroups_y:
 809       return LowerImplicitParameter(DAG, VT, DL, 1);
 810     case Intrinsic::r600_read_ngroups_z:
 811       return LowerImplicitParameter(DAG, VT, DL, 2);
 812     case Intrinsic::r600_read_global_size_x:
 813       return LowerImplicitParameter(DAG, VT, DL, 3);
 814     case Intrinsic::r600_read_global_size_y:
 815       return LowerImplicitParameter(DAG, VT, DL, 4);
 816     case Intrinsic::r600_read_global_size_z:
 817       return LowerImplicitParameter(DAG, VT, DL, 5);
 818     case Intrinsic::r600_read_local_size_x:
 819       return LowerImplicitParameter(DAG, VT, DL, 6);
 820     case Intrinsic::r600_read_local_size_y:
 821       return LowerImplicitParameter(DAG, VT, DL, 7);
 822     case Intrinsic::r600_read_local_size_z:
 823       return LowerImplicitParameter(DAG, VT, DL, 8);
 824
 825     case Intrinsic::AMDGPU_read_workdim:
 826       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 827
 828     case Intrinsic::r600_read_tgid_x:
 829       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 830                                   AMDGPU::T1_X, VT);
 831     case Intrinsic::r600_read_tgid_y:
 832       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 833                                   AMDGPU::T1_Y, VT);
 834     case Intrinsic::r600_read_tgid_z:
 835       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 836                                   AMDGPU::T1_Z, VT);
 837     case Intrinsic::r600_read_tidig_x:
 838       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 839                                   AMDGPU::T0_X, VT);
 840     case Intrinsic::r600_read_tidig_y:
 841       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 842                                   AMDGPU::T0_Y, VT);
 843     case Intrinsic::r600_read_tidig_z:
 844       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 845                                   AMDGPU::T0_Z, VT);
 846     case Intrinsic::AMDGPU_rsq:
 847       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 848       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 849
 850     case AMDGPUIntrinsic::AMDGPU_fract:
 851     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
 852       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 853     }
 854     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 855     break;
 856   }
 857   } // end switch(Op.getOpcode())
 858   return SDValue();
 859 }
 860
     /// Pushes replacement value(s) for the results of \p N onto \p Results,
     /// choosing the strategy from the node's opcode. Opcodes that have no case
     /// below are forwarded to AMDGPUTargetLowering::ReplaceNodeResults.
     ///
     /// \param N       Node whose results are being replaced.
     /// \param Results Output list that receives the replacement values.
     /// \param DAG     SelectionDAG used to build the replacement nodes.
 861 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 862                                             SmallVectorImpl<SDValue> &Results,
 863                                             SelectionDAG &DAG) const {
 864   switch (N->getOpcode()) {
 865   default:
 866     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 867     return;
 868   case ISD::FP_TO_UINT:
         // An i1 result is built directly by LowerFPTOUINT from the FP operand.
 869     if (N->getValueType(0) == MVT::i1) {
 870       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 871       return;
 872     }
 873     // Fall-through. Since we don't care about out of bounds values
 874     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 875     // considers some extra cases which are not necessary here.
 876   case ISD::FP_TO_SINT: {
         // Nothing is pushed when expandFP_TO_SINT reports failure.
 877     SDValue Result;
 878     if (expandFP_TO_SINT(N, Result, DAG))
 879       Results.push_back(Result);
 880     return;
 881   }
       // UDIV / UREM: build a two-result UDIVREM on the same operands and push
       // value 0 (for UDIV) or value 1 (for UREM) of that node.
 882   case ISD::UDIV: {
 883     SDValue Op = SDValue(N, 0);
 884     SDLoc DL(Op);
 885     EVT VT = Op.getValueType();
 886     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 887       N->getOperand(0), N->getOperand(1));
 888     Results.push_back(UDIVREM);
 889     break;
 890   }
 891   case ISD::UREM: {
 892     SDValue Op = SDValue(N, 0);
 893     SDLoc DL(Op);
 894     EVT VT = Op.getValueType();
 895     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 896       N->getOperand(0), N->getOperand(1));
 897     Results.push_back(UDIVREM.getValue(1));
 898     break;
 899   }
       // SDIV / SREM: same scheme as above, using a two-result SDIVREM.
 900   case ISD::SDIV: {
 901     SDValue Op = SDValue(N, 0);
 902     SDLoc DL(Op);
 903     EVT VT = Op.getValueType();
 904     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 905       N->getOperand(0), N->getOperand(1));
 906     Results.push_back(SDIVREM);
 907     break;
 908   }
 909   case ISD::SREM: {
 910     SDValue Op = SDValue(N, 0);
 911     SDLoc DL(Op);
 912     EVT VT = Op.getValueType();
 913     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 914       N->getOperand(0), N->getOperand(1));
 915     Results.push_back(SDIVREM.getValue(1));
 916     break;
 917   }
 918   case ISD::SDIVREM: {
         // Note: Op refers to result #1 of N here. The lowered value is pushed
         // first, followed by value 1 of the same lowered node.
 919     SDValue Op = SDValue(N, 1);
 920     SDValue RES = LowerSDIVREM(Op, DAG);
 921     Results.push_back(RES);
 922     Results.push_back(RES.getValue(1));
 923     break;
 924   }
 925   case ISD::UDIVREM: {
         // LowerUDIVREM64 is handed Results directly; this case pushes nothing
         // itself.
 926     SDValue Op = SDValue(N, 0);
 927     LowerUDIVREM64(Op, DAG, Results);
 928     break;
 929   }
 930   }
 931 }
 932
     /// Rebuilds \p Vector as an AMDGPUISD::BUILD_VERTICAL_VECTOR node of the
     /// same vector type. Its operands are the elements of \p Vector, each
     /// obtained with an EXTRACT_VECTOR_ELT at a constant index.
     ///
     /// \returns the new BUILD_VERTICAL_VECTOR node.
 933 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 934                                                    SDValue Vector) const {
 935
 936   SDLoc DL(Vector);
 937   EVT VecVT = Vector.getValueType();
 938   EVT EltVT = VecVT.getVectorElementType();
 939   SmallVector<SDValue, 8> Args;
 940
       // Extract element i for every index of the vector, in order.
 941   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 942                                                            i != e; ++i) {
 943     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 944                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 945   }
 946
 947   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 948 }
 949
     /// Custom lowering for EXTRACT_VECTOR_ELT.
     ///
     /// \p Op is returned unchanged when the index is a constant or when the
     /// vector operand is already a BUILD_VERTICAL_VECTOR. Otherwise the vector
     /// operand is rebuilt with vectorToVerticalVector and a new
     /// EXTRACT_VECTOR_ELT with the same (non-constant) index is returned.
 950 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 951                                                     SelectionDAG &DAG) const {
 952
 953   SDLoc DL(Op);
 954   SDValue Vector = Op.getOperand(0);
 955   SDValue Index = Op.getOperand(1);
 956
       // Nothing to do for constant indices or already-converted vectors.
 957   if (isa<ConstantSDNode>(Index) ||
 958       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 959     return Op;
 960
 961   Vector = vectorToVerticalVector(DAG, Vector);
 962   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 963                      Vector, Index);
 964 }
 965
     /// Custom lowering for INSERT_VECTOR_ELT.
     ///
     /// \p Op is returned unchanged when the index is a constant or when the
     /// vector operand is already a BUILD_VERTICAL_VECTOR. Otherwise the vector
     /// operand is rebuilt with vectorToVerticalVector, the insert is re-emitted
     /// on it, and the result of that insert is converted with
     /// vectorToVerticalVector once more before being returned.
 966 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 967                                                    SelectionDAG &DAG) const {
 968   SDLoc DL(Op);
 969   SDValue Vector = Op.getOperand(0);
 970   SDValue Value = Op.getOperand(1);
 971   SDValue Index = Op.getOperand(2);
 972
       // Nothing to do for constant indices or already-converted vectors.
 973   if (isa<ConstantSDNode>(Index) ||
 974       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 975     return Op;
 976
 977   Vector = vectorToVerticalVector(DAG, Vector);
 978   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 979                                Vector, Value, Index);
 980   return vectorToVerticalVector(DAG, Insert);
 981 }
 982
     /// Lowers ISD::FSIN / ISD::FCOS to AMDGPUISD::SIN_HW / AMDGPUISD::COS_HW
     /// applied to a range-reduced argument, FRACT(x / 2Pi + 0.5) - 0.5.
     ///
     /// Any other opcode hits llvm_unreachable. For generations before R700 the
     /// node that is returned is the trig node multiplied by Pi.
 983 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 984   // On hw >= R700, COS/SIN input must be between -1. and 1.
 985   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 986   EVT VT = Op.getValueType();
 987   SDValue Arg = Op.getOperand(0);
       // FractPart = FRACT(Arg * 0.15915494309 + 0.5); the factor is 1 / (2 * Pi).
 988   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 989       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 990         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 991           DAG.getConstantFP(0.15915494309, MVT::f32)),
 992         DAG.getConstantFP(0.5, MVT::f32)));
 993   unsigned TrigNode;
 994   switch (Op.getOpcode()) {
 995   case ISD::FCOS:
 996     TrigNode = AMDGPUISD::COS_HW;
 997     break;
 998   case ISD::FSIN:
 999     TrigNode = AMDGPUISD::SIN_HW;
1000     break;
1001   default:
1002     llvm_unreachable("Wrong trig opcode");
1003   }
       // TrigVal = TRIG(FractPart - 0.5).
1004   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
1005       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
1006         DAG.getConstantFP(-0.5, MVT::f32)));
1007   if (Gen >= AMDGPUSubtarget::R700)
1008     return TrigVal;
1009   // On R600 hw, COS/SIN input must be between -Pi and Pi.
       // NOTE(review): the multiply below is applied to TrigVal, i.e. to the
       // result of the trig node rather than to its input as the comment above
       // suggests -- confirm this is the intended behaviour.
1010   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1011       DAG.getConstantFP(3.14159265359, MVT::f32));
1012 }
1013
     /// Expands a double-width left shift given as (Lo, Hi, Shift) operands into
     /// operations on the single-width type VT.
     ///
     /// With Width = VT's size in bits, the node built here computes:
     ///   Shift <u Width:  Hi = (Hi << Shift) | ((Lo >> (Width-1-Shift)) >> 1)
     ///                    Lo = Lo << Shift
     ///   otherwise:       Hi = Lo << (Shift - Width)
     ///                    Lo = 0
     ///
     /// \returns a MERGE_VALUES node carrying (Lo, Hi).
1014 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1015   SDLoc DL(Op);
1016   EVT VT = Op.getValueType();
1017
1018   SDValue Lo = Op.getOperand(0);
1019   SDValue Hi = Op.getOperand(1);
1020   SDValue Shift = Op.getOperand(2);
1021   SDValue Zero = DAG.getConstant(0, VT);
1022   SDValue One  = DAG.getConstant(1, VT);
1023
1024   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1025   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
       // BigShift is the shift amount used when Shift >= Width; CompShift is
       // (Width - 1 - Shift), used to move Lo's top bits into Hi.
1026   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1027   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1028
1029   // The dance around Width1 is necessary for 0 special case.
1030   // Without it the CompShift might be 32, producing incorrect results in
1031   // Overflow. So we do the shift in two steps, the alternative is to
1032   // add a conditional to filter the special case.
1033
       // Overflow = bits of Lo that cross into Hi: (Lo >> CompShift) >> 1.
1034   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1035   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1036
       // Results for Shift < Width.
1037   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1038   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1039   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1040
       // Results for Shift >= Width.
1041   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1042   SDValue LoBig = Zero;
1043
       // Pick between the two variants with an unsigned Shift < Width compare.
1044   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1045   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1046
1047   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1048 }
1049
     /// Expands a double-width right shift given as (Lo, Hi, Shift) operands into
     /// operations on the single-width type VT. The shift is arithmetic when the
     /// opcode of \p Op is ISD::SRA_PARTS and logical otherwise.
     ///
     /// With Width = VT's size in bits and ">>x" being SRA or SRL as selected:
     ///   Shift <u Width:  Hi = Hi >>x Shift
     ///                    Lo = (Lo >> Shift) | ((Hi << (Width-1-Shift)) << 1)
     ///   otherwise:       Lo = Hi >>x (Shift - Width)
     ///                    Hi = arithmetic ? (Hi >>a (Width-1)) : 0
     ///
     /// \returns a MERGE_VALUES node carrying (Lo, Hi).
1050 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1051   SDLoc DL(Op);
1052   EVT VT = Op.getValueType();
1053
1054   SDValue Lo = Op.getOperand(0);
1055   SDValue Hi = Op.getOperand(1);
1056   SDValue Shift = Op.getOperand(2);
1057   SDValue Zero = DAG.getConstant(0, VT);
1058   SDValue One  = DAG.getConstant(1, VT);
1059
       // True for an arithmetic shift, false for a logical one.
1060   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1061
1062   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1063   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1064   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1065   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1066
1067   // The dance around Width1 is necessary for 0 special case.
1068   // Without it the CompShift might be 32, producing incorrect results in
1069   // Overflow. So we do the shift in two steps, the alternative is to
1070   // add a conditional to filter the special case.
1071
       // Overflow = bits of Hi that cross into Lo: (Hi << CompShift) << 1.
1072   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1073   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1074
       // Results for Shift < Width.
1075   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1076   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1077   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1078
       // Results for Shift >= Width.
1079   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1080   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1081
       // Pick between the two variants with an unsigned Shift < Width compare.
1082   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1083   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1084
1085   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1086 }
1087
     /// Lowers a two-result arithmetic-with-overflow node into a pair of nodes
     /// on the same two operands.
     ///
     /// \param Op     Node being lowered; operands 0 and 1 are the inputs.
     /// \param mainop Opcode used to compute the first (arithmetic) result.
     /// \param ovf    Opcode used to compute the second (overflow) result; its
     ///               value is sign-extended in-register from i1.
     /// \returns a MERGE_VALUES node carrying (Res, OVF), both of type VT.
1088 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1089                                           unsigned mainop, unsigned ovf) const {
1090   SDLoc DL(Op);
1091   EVT VT = Op.getValueType();
1092
       // NOTE(review): despite their names, Lo and Hi are just operand 0 and
       // operand 1 of Op; both are fed unchanged to `ovf` and `mainop`.
1093   SDValue Lo = Op.getOperand(0);
1094   SDValue Hi = Op.getOperand(1);
1095
1096   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1097   // Extend sign.
1098   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1099                     DAG.getValueType(MVT::i1));
1100
1101   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1102
1103   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1104 }
1105
     /// Builds an i1 SETCC node that compares \p Op against the f32 constant 0.0
     /// with condition code SETNE. ReplaceNodeResults uses this for FP_TO_UINT
     /// nodes whose result type is i1.
1106 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1107   return DAG.getNode(
1108       ISD::SETCC,
1109       SDLoc(Op),
1110       MVT::i1,
1111       Op, DAG.getConstantFP(0.0f, MVT::f32),
1112       DAG.getCondCode(ISD::SETNE)
1113       );
1114 }
1115
     /// Builds a load of type \p VT for an implicit parameter.
     ///
     /// The load address is the i32 constant DwordOffset * 4, the chain is the
     /// DAG entry node, and the pointer info is a null pointer whose type is a
     /// pointer to \p VT in address space AMDGPUAS::CONSTANT_BUFFER_0.
     ///
     /// \param DwordOffset Offset of the parameter in 32-bit units.
     /// \returns the load node.
1116 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1117                                                    SDLoc DL,
1118                                                    unsigned DwordOffset) const {
1119   unsigned ByteOffset = DwordOffset * 4;
1120   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1121                                       AMDGPUAS::CONSTANT_BUFFER_0);
1122
1123   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1124   assert(isInt<16>(ByteOffset));
1125
       // Trailing arguments: presumably the volatile / non-temporal / invariant
       // flags followed by the alignment -- TODO confirm against getLoad.
1126   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1127                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1128                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1129                      false, false, false, 0);
1130 }
1131
     /// Returns true if \p Op is an integer constant node holding the null value
     /// or a floating-point constant node for which isZero() holds. Any other
     /// kind of node yields false.
1132 bool R600TargetLowering::isZero(SDValue Op) const {
1133   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1134     return Cst->isNullValue();
1135   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1136     return CstFP->isZero();
1137   } else {
1138     return false;
1139   }
1140 }
1141
     /// Custom lowering for SELECT_CC (operands: LHS, RHS, True, False, CC).
     ///
     /// The function tries the following, in order, and returns on the first
     /// that applies:
     ///  1. For an f32 result, a legacy fmin/fmax via CombineFMinMaxLegacy.
     ///  2. A form matchable by a SET* instruction: True/False are the hardware
     ///     true/false values (after possibly inverting the condition).
     ///  3. A form matchable by a CND* instruction: a comparison against zero
     ///     (after possibly moving the zero operand to the RHS).
     ///  4. Otherwise, two chained SELECT_CC nodes: the first produces the
     ///     hardware true/false value for the original comparison, the second
     ///     selects True/False by comparing that value against hardware false.
1142 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1143   SDLoc DL(Op);
1144   EVT VT = Op.getValueType();
1145
1146   SDValue LHS = Op.getOperand(0);
1147   SDValue RHS = Op.getOperand(1);
1148   SDValue True = Op.getOperand(2);
1149   SDValue False = Op.getOperand(3);
1150   SDValue CC = Op.getOperand(4);
1151   SDValue Temp;
1152
1153   if (VT == MVT::f32) {
1154     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1155     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1156     if (MinMax)
1157       return MinMax;
1158   }
1159
1160   // LHS and RHS are guaranteed to be the same value type
1161   EVT CompareVT = LHS.getValueType();
1162
1163   // Check if we can lower this to a native operation.
1164
1165   // Try to lower to a SET* instruction:
1166   //
1167   // SET* can match the following patterns:
1168   //
1169   // select_cc f32, f32, -1,  0, cc_supported
1170   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1171   // select_cc i32, i32, -1,  0, cc_supported
1172   //
1173
1174   // Move hardware True/False values to the correct operand.
1175   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1176   ISD::CondCode InverseCC =
1177      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
       // True/False are swapped relative to what SET* produces: invert the
       // condition if the inverse is legal, else also try swapping LHS/RHS.
       // If neither condition code is legal, nothing is changed here.
1178   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1179     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1180       std::swap(False, True);
1181       CC = DAG.getCondCode(InverseCC);
1182     } else {
1183       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1184       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1185         std::swap(False, True);
1186         std::swap(LHS, RHS);
1187         CC = DAG.getCondCode(SwapInvCC);
1188       }
1189     }
1190   }
1191
1192   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1193       (CompareVT == VT || VT == MVT::i32)) {
1194     // This can be matched by a SET* instruction.
1195     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1196   }
1197
1198   // Try to lower to a CND* instruction:
1199   //
1200   // CND* can match the following patterns:
1201   //
1202   // select_cc f32, 0.0, f32, f32, cc_supported
1203   // select_cc f32, 0.0, i32, i32, cc_supported
1204   // select_cc i32, 0,   f32, f32, cc_supported
1205   // select_cc i32, 0,   i32, i32, cc_supported
1206   //
1207
1208   // Try to move the zero value to the RHS
1209   if (isZero(LHS)) {
1210     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1211     // Try swapping the operands
1212     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1213     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1214       std::swap(LHS, RHS);
1215       CC = DAG.getCondCode(CCSwapped);
1216     } else {
1217       // Try inverting the condition and then swapping the operands
1218       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1219       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1220       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1221         std::swap(True, False);
1222         std::swap(LHS, RHS);
1223         CC = DAG.getCondCode(CCSwapped);
1224       }
1225     }
1226   }
1227   if (isZero(RHS)) {
1228     SDValue Cond = LHS;
1229     SDValue Zero = RHS;
1230     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1231     if (CompareVT != VT) {
1232       // Bitcast True / False to the correct types.  This will end up being
1233       // a nop, but it allows us to define only a single pattern in the
1234       // .TD files for each CND* instruction rather than having to have
1235       // one pattern for integer True/False and one for fp True/False
1236       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1237       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1238     }
1239
         // "Not equal" conditions are replaced by their inverse, with True and
         // False exchanged to compensate.
1240     switch (CCOpcode) {
1241     case ISD::SETONE:
1242     case ISD::SETUNE:
1243     case ISD::SETNE:
1244       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1245       Temp = True;
1246       True = False;
1247       False = Temp;
1248       break;
1249     default:
1250       break;
1251     }
         // The select is emitted in CompareVT and bitcast back to VT.
1252     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1253         Cond, Zero,
1254         True, False,
1255         DAG.getCondCode(CCOpcode));
1256     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1257   }
1258
1259   // If we make it this far it means we have no native instructions to handle
1260   // this SELECT_CC, so we must lower it.
1261   SDValue HWTrue, HWFalse;
1262
       // Hardware true/false: 1.0f / 0.0f for f32 compares, -1 / 0 for i32.
1263   if (CompareVT == MVT::f32) {
1264     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1265     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1266   } else if (CompareVT == MVT::i32) {
1267     HWTrue = DAG.getConstant(-1, CompareVT);
1268     HWFalse = DAG.getConstant(0, CompareVT);
1269   }
1270   else {
1271     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1272   }
1273
1274   // Lower this unsupported SELECT_CC into a combination of two supported
1275   // SELECT_CC operations.
1276   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1277
1278   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1279       Cond, HWFalse,
1280       True, False,
1281       DAG.getCondCode(ISD::SETNE));
1282 }
1283
1284 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1285 /// convert these pointers to a register index.  Each register holds
1286 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1287 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1288 /// for indirect addressing.
     ///
     /// The conversion is a logical right shift of \p Ptr by 2, 3 or 4 bits for a
     /// \p StackWidth of 1, 2 or 4 respectively; any other width is unreachable.
1289 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1290                                                unsigned StackWidth,
1291                                                SelectionDAG &DAG) const {
       // Shift amount, i.e. log2 of the number of bytes used per register.
1292   unsigned SRLPad;
1293   switch(StackWidth) {
1294   case 1:
1295     SRLPad = 2;
1296     break;
1297   case 2:
1298     SRLPad = 3;
1299     break;
1300   case 4:
1301     SRLPad = 4;
1302     break;
1303   default: llvm_unreachable("Invalid stack width");
1304   }
1305
1306   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1307                      DAG.getConstant(SRLPad, MVT::i32));
1308 }
1309
     /// Computes the channel and pointer increment to use for element
     /// \p ElemIdx of a value accessed through indirect addressing.
     ///
     /// \param StackWidth Stack width; 2 and 4 have their own cases, every other
     ///                   value is handled like 1.
     /// \param ElemIdx    Index of the element being accessed.
     /// \param Channel    [out] Channel for this element.
     /// \param PtrIncr    [out] Amount to add to the register index for this
     ///                   element. LowerSTORE adds it to a running pointer, so
     ///                   it acts as a step from the previous element.
1310 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1311                                          unsigned ElemIdx,
1312                                          unsigned &Channel,
1313                                          unsigned &PtrIncr) const {
1314   switch (StackWidth) {
1315   default:
1316   case 1:
         // Always channel 0; step by one for every element after the first.
1317     Channel = 0;
1318     if (ElemIdx > 0) {
1319       PtrIncr = 1;
1320     } else {
1321       PtrIncr = 0;
1322     }
1323     break;
1324   case 2:
         // Channels alternate 0/1; step by one only at element 2.
1325     Channel = ElemIdx % 2;
1326     if (ElemIdx == 2) {
1327       PtrIncr = 1;
1328     } else {
1329       PtrIncr = 0;
1330     }
1331     break;
1332   case 4:
         // The element index is the channel; the pointer never advances.
1333     Channel = ElemIdx;
1334     PtrIncr = 0;
1335     break;
1336   }
1337 }
1338
     /// Custom lowering for store nodes.
     ///
     /// Order of attempts:
     ///  1. Whatever AMDGPUTargetLowering::LowerSTORE returns, if non-null.
     ///  2. Global address space, truncating store: an AMDGPUISD::STORE_MSKOR
     ///     memory intrinsic on the containing dword.
     ///  3. Global address space, value of at least 32 bits whose pointer is not
     ///     yet an AMDGPUISD::DWORDADDR: a new store through a dword address.
     ///  4. Any address space other than private: an empty SDValue.
     ///  5. Private address space: AMDGPUISD::REGISTER_STORE node(s) using
     ///     indirect addressing.
     ///
     /// \returns the lowered node or chain, or an empty SDValue as per step 4.
1339 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1340   SDLoc DL(Op);
1341   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1342   SDValue Chain = Op.getOperand(0);
1343   SDValue Value = Op.getOperand(1);
1344   SDValue Ptr = Op.getOperand(2);
1345
1346   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1347   if (Result.getNode()) {
1348     return Result;
1349   }
1350
1351   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1352     if (StoreNode->isTruncatingStore()) {
1353       EVT VT = Value.getValueType();
1354       assert(VT.bitsLE(MVT::i32));
1355       EVT MemVT = StoreNode->getMemoryVT();
           // Mask covering the stored width: 8 bits for i8, 16 bits for i16.
1356       SDValue MaskConstant;
1357       if (MemVT == MVT::i8) {
1358         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1359       } else {
1360         assert(MemVT == MVT::i16);
1361         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1362       }
           // DWordAddr = Ptr >> 2, ByteIndex = Ptr & 3, Shift = ByteIndex * 8.
           // Both the truncated value and the mask are shifted left by Shift.
1363       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1364                                       DAG.getConstant(2, MVT::i32));
1365       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1366                                       DAG.getConstant(0x00000003, VT));
1367       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1368       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1369                                    DAG.getConstant(3, VT));
1370       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1371       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1372       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1373       // vector instead.
           // The intrinsic input is the v4i32 { ShiftedValue, 0, 0, Mask }.
1374       SDValue Src[4] = {
1375         ShiftedValue,
1376         DAG.getConstant(0, MVT::i32),
1377         DAG.getConstant(0, MVT::i32),
1378         Mask
1379       };
1380       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1381       SDValue Args[3] = { Chain, Input, DWordAddr };
1382       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1383                                      Op->getVTList(), Args, MemVT,
1384                                      StoreNode->getMemOperand());
1385     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1386                Value.getValueType().bitsGE(MVT::i32)) {
1387       // Convert pointer from byte address to dword address.
1388       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1389                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1390                                     Ptr, DAG.getConstant(2, MVT::i32)));
1391
           // isTruncatingStore() is already known to be false on this branch,
           // so only the isIndexed() part of the check can trigger here.
1392       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1393         llvm_unreachable("Truncated and indexed stores not supported yet");
1394       } else {
1395         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1396       }
1397       return Chain;
1398     }
1399   }
1400
1401   EVT ValueVT = Value.getValueType();
1402
       // Only private-address-space stores are handled past this point.
1403   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1404     return SDValue();
1405   }
1406
       // NOTE(review): this is the same call, on the same Op, as at the top of
       // the function, where a non-null result was already returned -- it looks
       // redundant; confirm before removing.
1407   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1408   if (Ret.getNode()) {
1409     return Ret;
1410   }
1411   // Lowering for indirect addressing
1412
1413   const MachineFunction &MF = DAG.getMachineFunction();
1414   const AMDGPUFrameLowering *TFL =
1415       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1416   unsigned StackWidth = TFL->getStackWidth(MF);
1417
       // Convert the byte address into a register index.
1418   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1419
1420   if (ValueVT.isVector()) {
1421     unsigned NumElemVT = ValueVT.getVectorNumElements();
1422     EVT ElemVT = ValueVT.getVectorElementType();
1423     SmallVector<SDValue, 4> Stores(NumElemVT);
1424
1425     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1426                                       "vector width in load");
1427
         // One REGISTER_STORE per element. Ptr is advanced cumulatively by the
         // increment getStackAddress reports for each element, and all stores
         // hang off the incoming Chain before being joined by a TokenFactor.
1428     for (unsigned i = 0; i < NumElemVT; ++i) {
1429       unsigned Channel, PtrIncr;
1430       getStackAddress(StackWidth, i, Channel, PtrIncr);
1431       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1432                         DAG.getConstant(PtrIncr, MVT::i32));
1433       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1434                                  Value, DAG.getConstant(i, MVT::i32));
1435
1436       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1437                               Chain, Elem, Ptr,
1438                               DAG.getTargetConstant(Channel, MVT::i32));
1439     }
1440      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1441    } else {
         // Scalar store: i8 values are zero-extended to i32 first, and the
         // value is written to channel 0.
1442     if (ValueVT == MVT::i8) {
1443       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1444     }
1445     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1446     DAG.getTargetConstant(0, MVT::i32)); // Channel
1447   }
1448
1449   return Chain;
1450 }
1451
1452 // Returns 512 + (kc_bank << 12) for AMDGPUAS::CONSTANT_BUFFER_<kc_bank>
     // (kc_bank in 0..15; 4096 == 1 << 12), or -1 if AddressSpace is not one of
     // the sixteen constant-buffer address spaces.
1453 static int
1454 ConstantAddressBlock(unsigned AddressSpace) {
1455   switch (AddressSpace) {
1456   case AMDGPUAS::CONSTANT_BUFFER_0:
1457     return 512;
1458   case AMDGPUAS::CONSTANT_BUFFER_1:
1459     return 512 + 4096;
1460   case AMDGPUAS::CONSTANT_BUFFER_2:
1461     return 512 + 4096 * 2;
1462   case AMDGPUAS::CONSTANT_BUFFER_3:
1463     return 512 + 4096 * 3;
1464   case AMDGPUAS::CONSTANT_BUFFER_4:
1465     return 512 + 4096 * 4;
1466   case AMDGPUAS::CONSTANT_BUFFER_5:
1467     return 512 + 4096 * 5;
1468   case AMDGPUAS::CONSTANT_BUFFER_6:
1469     return 512 + 4096 * 6;
1470   case AMDGPUAS::CONSTANT_BUFFER_7:
1471     return 512 + 4096 * 7;
1472   case AMDGPUAS::CONSTANT_BUFFER_8:
1473     return 512 + 4096 * 8;
1474   case AMDGPUAS::CONSTANT_BUFFER_9:
1475     return 512 + 4096 * 9;
1476   case AMDGPUAS::CONSTANT_BUFFER_10:
1477     return 512 + 4096 * 10;
1478   case AMDGPUAS::CONSTANT_BUFFER_11:
1479     return 512 + 4096 * 11;
1480   case AMDGPUAS::CONSTANT_BUFFER_12:
1481     return 512 + 4096 * 12;
1482   case AMDGPUAS::CONSTANT_BUFFER_13:
1483     return 512 + 4096 * 13;
1484   case AMDGPUAS::CONSTANT_BUFFER_14:
1485     return 512 + 4096 * 14;
1486   case AMDGPUAS::CONSTANT_BUFFER_15:
1487     return 512 + 4096 * 15;
1488   default:
         // Not a constant-buffer address space.
1489     return -1;
1490   }
1491 }
1492
1493 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1494 {
1495   EVT VT = Op.getValueType();
1496   SDLoc DL(Op);
1497   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1498   SDValue Chain = Op.getOperand(0);
1499   SDValue Ptr = Op.getOperand(1);
1500   SDValue LoweredLoad;
1501
1502   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1503   if (Ret.getNode()) {
1504     SDValue Ops[2] = {
1505       Ret,
1506       Chain
1507     };
1508     return DAG.getMergeValues(Ops, DL);
1509   }
1510
1511   // Lower loads constant address space global variable loads
1512   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1513       isa<GlobalVariable>(GetUnderlyingObject(
1514           LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
1515
1516     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1517         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1518     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1519         DAG.getConstant(2, MVT::i32));
1520     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1521                        LoadNode->getChain(), Ptr,
1522                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1523   }
1524
1525   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1526     SDValue MergedValues[2] = {
1527       ScalarizeVectorLoad(Op, DAG),
1528       Chain
1529     };
1530     return DAG.getMergeValues(MergedValues, DL);
1531   }
1532
1533   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1534   if (ConstantBlock > -1 &&
1535       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1536        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1537     SDValue Result;
1538     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1539         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1540         isa<ConstantSDNode>(Ptr)) {
1541       SDValue Slots[4];
1542       for (unsigned i = 0; i < 4; i++) {
1543         // We want Const position encoded with the following formula :
1544         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1545         // const_index is Ptr computed by llvm using an alignment of 16.
1546         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1547         // then div by 4 at the ISel step
1548         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1549             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1550         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1551       }
1552       EVT NewVT = MVT::v4i32;
1553       unsigned NumElements = 4;
1554       if (VT.isVector()) {
1555         NewVT = VT;
1556         NumElements = VT.getVectorNumElements();
1557       }
1558       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1559                            makeArrayRef(Slots, NumElements));
1560     } else {
1561       // non-constant ptr can't be folded, keeps it as a v4f32 load
1562       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1563           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1564           DAG.getConstant(LoadNode->getAddressSpace() -
1565                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1566           );
1567     }
1568
1569     if (!VT.isVector()) {
1570       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1571           DAG.getConstant(0, MVT::i32));
1572     }
1573
1574     SDValue MergedValues[2] = {
1575       Result,
1576       Chain
1577     };
1578     return DAG.getMergeValues(MergedValues, DL);
1579   }
1580
1581   // For most operations returning SDValue() will result in the node being
1582   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1583   // need to manually expand loads that may be legal in some address spaces and
1584   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1585   // compute shaders, since the data is sign extended when it is uploaded to the
1586   // buffer. However SEXT loads from other address spaces are not supported, so
1587   // we need to expand them here.
1588   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1589     EVT MemVT = LoadNode->getMemoryVT();
1590     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1591     SDValue ShiftAmount =
1592           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1593     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1594                                   LoadNode->getPointerInfo(), MemVT,
1595                                   LoadNode->isVolatile(),
1596                                   LoadNode->isNonTemporal(),
1597                                   LoadNode->isInvariant(),
1598                                   LoadNode->getAlignment());
1599     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1600     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1601
1602     SDValue MergedValues[2] = { Sra, Chain };
1603     return DAG.getMergeValues(MergedValues, DL);
1604   }
1605
1606   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1607     return SDValue();
1608   }
1609
1610   // Lowering for indirect addressing
1611   const MachineFunction &MF = DAG.getMachineFunction();
1612   const AMDGPUFrameLowering *TFL =
1613       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1614   unsigned StackWidth = TFL->getStackWidth(MF);
1615
1616   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1617
1618   if (VT.isVector()) {
1619     unsigned NumElemVT = VT.getVectorNumElements();
1620     EVT ElemVT = VT.getVectorElementType();
1621     SDValue Loads[4];
1622
1623     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1624                                       "vector width in load");
1625
1626     for (unsigned i = 0; i < NumElemVT; ++i) {
1627       unsigned Channel, PtrIncr;
1628       getStackAddress(StackWidth, i, Channel, PtrIncr);
1629       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1630                         DAG.getConstant(PtrIncr, MVT::i32));
1631       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1632                              Chain, Ptr,
1633                              DAG.getTargetConstant(Channel, MVT::i32),
1634                              Op.getOperand(2));
1635     }
1636     for (unsigned i = NumElemVT; i < 4; ++i) {
1637       Loads[i] = DAG.getUNDEF(ElemVT);
1638     }
1639     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1640     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1641   } else {
1642     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1643                               Chain, Ptr,
1644                               DAG.getTargetConstant(0, MVT::i32), // Channel
1645                               Op.getOperand(2));
1646   }
1647
1648   SDValue Ops[2] = {
1649     LoweredLoad,
1650     Chain
1651   };
1652
1653   return DAG.getMergeValues(Ops, DL);
1654 }
1655
1656 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1657   SDValue Chain = Op.getOperand(0);
1658   SDValue Cond  = Op.getOperand(1);
1659   SDValue Jump  = Op.getOperand(2);
1660
1661   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1662                      Chain, Jump, Cond);
1663 }
1664
1665 /// XXX Only kernel functions are supported, so we can assume for now that
1666 /// every function is a kernel function, but in the future we should use
1667 /// separate calling conventions for kernel and non-kernel functions.
1668 SDValue R600TargetLowering::LowerFormalArguments(
1669                                       SDValue Chain,
1670                                       CallingConv::ID CallConv,
1671                                       bool isVarArg,
1672                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1673                                       SDLoc DL, SelectionDAG &DAG,
1674                                       SmallVectorImpl<SDValue> &InVals) const {
1675   SmallVector<CCValAssign, 16> ArgLocs;
1676   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1677                  *DAG.getContext());
1678   MachineFunction &MF = DAG.getMachineFunction();
1679   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1680
1681   SmallVector<ISD::InputArg, 8> LocalIns;
1682
1683   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1684
1685   AnalyzeFormalArguments(CCInfo, LocalIns);
1686
1687   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1688     CCValAssign &VA = ArgLocs[i];
1689     const ISD::InputArg &In = Ins[i];
1690     EVT VT = In.VT;
1691     EVT MemVT = VA.getLocVT();
1692     if (!VT.isVector() && MemVT.isVector()) {
1693       // Get load source type if scalarized.
1694       MemVT = MemVT.getVectorElementType();
1695     }
1696
1697     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1698       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1699       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1700       InVals.push_back(Register);
1701       continue;
1702     }
1703
1704     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1705                                           AMDGPUAS::CONSTANT_BUFFER_0);
1706
1707     // i64 isn't a legal type, so the register type used ends up as i32, which
1708     // isn't expected here. It attempts to create this sextload, but it ends up
1709     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1710     // for <1 x i64>.
1711
1712     // The first 36 bytes of the input buffer contains information about
1713     // thread group and global sizes.
1714     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1715     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1716       // FIXME: This should really check the extload type, but the handling of
1717       // extload vector parameters seems to be broken.
1718
1719       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1720       Ext = ISD::SEXTLOAD;
1721     }
1722
1723     // Compute the offset from the value.
1724     // XXX - I think PartOffset should give you this, but it seems to give the
1725     // size of the register which isn't useful.
1726
1727     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1728     unsigned PartOffset = VA.getLocMemOffset();
1729     unsigned Offset = 36 + VA.getLocMemOffset();
1730
1731     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1732     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1733                               DAG.getConstant(Offset, MVT::i32),
1734                               DAG.getUNDEF(MVT::i32),
1735                               PtrInfo,
1736                               MemVT, false, true, true, 4);
1737
1738     // 4 is the preferred alignment for the CONSTANT memory space.
1739     InVals.push_back(Arg);
1740     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1741   }
1742   return Chain;
1743 }
1744
1745 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1746    if (!VT.isVector())
1747      return MVT::i32;
1748    return VT.changeVectorElementTypeToInteger();
1749 }
1750
1751 static SDValue CompactSwizzlableVector(
1752   SelectionDAG &DAG, SDValue VectorEntry,
1753   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1754   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1755   assert(RemapSwizzle.empty());
1756   SDValue NewBldVec[4] = {
1757     VectorEntry.getOperand(0),
1758     VectorEntry.getOperand(1),
1759     VectorEntry.getOperand(2),
1760     VectorEntry.getOperand(3)
1761   };
1762
1763   for (unsigned i = 0; i < 4; i++) {
1764     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1765       // We mask write here to teach later passes that the ith element of this
1766       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1767       // break false dependencies and additionnaly make assembly easier to read.
1768       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1769     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1770       if (C->isZero()) {
1771         RemapSwizzle[i] = 4; // SEL_0
1772         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1773       } else if (C->isExactlyValue(1.0)) {
1774         RemapSwizzle[i] = 5; // SEL_1
1775         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1776       }
1777     }
1778
1779     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1780       continue;
1781     for (unsigned j = 0; j < i; j++) {
1782       if (NewBldVec[i] == NewBldVec[j]) {
1783         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1784         RemapSwizzle[i] = j;
1785         break;
1786       }
1787     }
1788   }
1789
1790   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1791                      VectorEntry.getValueType(), NewBldVec);
1792 }
1793
1794 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1795                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1796   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1797   assert(RemapSwizzle.empty());
1798   SDValue NewBldVec[4] = {
1799       VectorEntry.getOperand(0),
1800       VectorEntry.getOperand(1),
1801       VectorEntry.getOperand(2),
1802       VectorEntry.getOperand(3)
1803   };
1804   bool isUnmovable[4] = { false, false, false, false };
1805   for (unsigned i = 0; i < 4; i++) {
1806     RemapSwizzle[i] = i;
1807     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1808       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1809           ->getZExtValue();
1810       if (i == Idx)
1811         isUnmovable[Idx] = true;
1812     }
1813   }
1814
1815   for (unsigned i = 0; i < 4; i++) {
1816     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1817       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1818           ->getZExtValue();
1819       if (isUnmovable[Idx])
1820         continue;
1821       // Swap i and Idx
1822       std::swap(NewBldVec[Idx], NewBldVec[i]);
1823       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1824       break;
1825     }
1826   }
1827
1828   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1829                      VectorEntry.getValueType(), NewBldVec);
1830 }
1831
1832
1833 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1834 SDValue Swz[4], SelectionDAG &DAG) const {
1835   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1836   // Old -> New swizzle values
1837   DenseMap<unsigned, unsigned> SwizzleRemap;
1838
1839   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1840   for (unsigned i = 0; i < 4; i++) {
1841     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1842     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1843       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1844   }
1845
1846   SwizzleRemap.clear();
1847   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1848   for (unsigned i = 0; i < 4; i++) {
1849     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1850     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1851       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1852   }
1853
1854   return BuildVector;
1855 }
1856
1857
1858 //===----------------------------------------------------------------------===//
1859 // Custom DAG Optimizations
1860 //===----------------------------------------------------------------------===//
1861
1862 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1863                                               DAGCombinerInfo &DCI) const {
1864   SelectionDAG &DAG = DCI.DAG;
1865
1866   switch (N->getOpcode()) {
1867   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1868   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1869   case ISD::FP_ROUND: {
1870       SDValue Arg = N->getOperand(0);
1871       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1872         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1873                            Arg.getOperand(0));
1874       }
1875       break;
1876     }
1877
1878   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1879   // (i32 select_cc f32, f32, -1, 0 cc)
1880   //
1881   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1882   // this to one of the SET*_DX10 instructions.
1883   case ISD::FP_TO_SINT: {
1884     SDValue FNeg = N->getOperand(0);
1885     if (FNeg.getOpcode() != ISD::FNEG) {
1886       return SDValue();
1887     }
1888     SDValue SelectCC = FNeg.getOperand(0);
1889     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1890         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1891         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1892         !isHWTrueValue(SelectCC.getOperand(2)) ||
1893         !isHWFalseValue(SelectCC.getOperand(3))) {
1894       return SDValue();
1895     }
1896
1897     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1898                            SelectCC.getOperand(0), // LHS
1899                            SelectCC.getOperand(1), // RHS
1900                            DAG.getConstant(-1, MVT::i32), // True
1901                            DAG.getConstant(0, MVT::i32),  // False
1902                            SelectCC.getOperand(4)); // CC
1903
1904     break;
1905   }
1906
1907   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1908   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1909   case ISD::INSERT_VECTOR_ELT: {
1910     SDValue InVec = N->getOperand(0);
1911     SDValue InVal = N->getOperand(1);
1912     SDValue EltNo = N->getOperand(2);
1913     SDLoc dl(N);
1914
1915     // If the inserted element is an UNDEF, just use the input vector.
1916     if (InVal.getOpcode() == ISD::UNDEF)
1917       return InVec;
1918
1919     EVT VT = InVec.getValueType();
1920
1921     // If we can't generate a legal BUILD_VECTOR, exit
1922     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1923       return SDValue();
1924
1925     // Check that we know which element is being inserted
1926     if (!isa<ConstantSDNode>(EltNo))
1927       return SDValue();
1928     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1929
1930     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1931     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1932     // vector elements.
1933     SmallVector<SDValue, 8> Ops;
1934     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1935       Ops.append(InVec.getNode()->op_begin(),
1936                  InVec.getNode()->op_end());
1937     } else if (InVec.getOpcode() == ISD::UNDEF) {
1938       unsigned NElts = VT.getVectorNumElements();
1939       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1940     } else {
1941       return SDValue();
1942     }
1943
1944     // Insert the element
1945     if (Elt < Ops.size()) {
1946       // All the operands of BUILD_VECTOR must have the same type;
1947       // we enforce that here.
1948       EVT OpVT = Ops[0].getValueType();
1949       if (InVal.getValueType() != OpVT)
1950         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1951           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1952           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1953       Ops[Elt] = InVal;
1954     }
1955
1956     // Return the new vector
1957     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1958   }
1959
1960   // Extract_vec (Build_vector) generated by custom lowering
1961   // also needs to be customly combined
1962   case ISD::EXTRACT_VECTOR_ELT: {
1963     SDValue Arg = N->getOperand(0);
1964     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1965       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1966         unsigned Element = Const->getZExtValue();
1967         return Arg->getOperand(Element);
1968       }
1969     }
1970     if (Arg.getOpcode() == ISD::BITCAST &&
1971         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1972       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1973         unsigned Element = Const->getZExtValue();
1974         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1975             Arg->getOperand(0).getOperand(Element));
1976       }
1977     }
1978   }
1979
1980   case ISD::SELECT_CC: {
1981     // Try common optimizations
1982     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1983     if (Ret.getNode())
1984       return Ret;
1985
1986     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1987     //      selectcc x, y, a, b, inv(cc)
1988     //
1989     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1990     //      selectcc x, y, a, b, cc
1991     SDValue LHS = N->getOperand(0);
1992     if (LHS.getOpcode() != ISD::SELECT_CC) {
1993       return SDValue();
1994     }
1995
1996     SDValue RHS = N->getOperand(1);
1997     SDValue True = N->getOperand(2);
1998     SDValue False = N->getOperand(3);
1999     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2000
2001     if (LHS.getOperand(2).getNode() != True.getNode() ||
2002         LHS.getOperand(3).getNode() != False.getNode() ||
2003         RHS.getNode() != False.getNode()) {
2004       return SDValue();
2005     }
2006
2007     switch (NCC) {
2008     default: return SDValue();
2009     case ISD::SETNE: return LHS;
2010     case ISD::SETEQ: {
2011       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2012       LHSCC = ISD::getSetCCInverse(LHSCC,
2013                                   LHS.getOperand(0).getValueType().isInteger());
2014       if (DCI.isBeforeLegalizeOps() ||
2015           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2016         return DAG.getSelectCC(SDLoc(N),
2017                                LHS.getOperand(0),
2018                                LHS.getOperand(1),
2019                                LHS.getOperand(2),
2020                                LHS.getOperand(3),
2021                                LHSCC);
2022       break;
2023     }
2024     }
2025     return SDValue();
2026   }
2027
2028   case AMDGPUISD::EXPORT: {
2029     SDValue Arg = N->getOperand(1);
2030     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2031       break;
2032
2033     SDValue NewArgs[8] = {
2034       N->getOperand(0), // Chain
2035       SDValue(),
2036       N->getOperand(2), // ArrayBase
2037       N->getOperand(3), // Type
2038       N->getOperand(4), // SWZ_X
2039       N->getOperand(5), // SWZ_Y
2040       N->getOperand(6), // SWZ_Z
2041       N->getOperand(7) // SWZ_W
2042     };
2043     SDLoc DL(N);
2044     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2045     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2046   }
2047   case AMDGPUISD::TEXTURE_FETCH: {
2048     SDValue Arg = N->getOperand(1);
2049     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2050       break;
2051
2052     SDValue NewArgs[19] = {
2053       N->getOperand(0),
2054       N->getOperand(1),
2055       N->getOperand(2),
2056       N->getOperand(3),
2057       N->getOperand(4),
2058       N->getOperand(5),
2059       N->getOperand(6),
2060       N->getOperand(7),
2061       N->getOperand(8),
2062       N->getOperand(9),
2063       N->getOperand(10),
2064       N->getOperand(11),
2065       N->getOperand(12),
2066       N->getOperand(13),
2067       N->getOperand(14),
2068       N->getOperand(15),
2069       N->getOperand(16),
2070       N->getOperand(17),
2071       N->getOperand(18),
2072     };
2073     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2074     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2075         NewArgs);
2076   }
2077   }
2078
2079   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2080 }
2081
2082 static bool
2083 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2084             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2085   const R600InstrInfo *TII =
2086       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2087   if (!Src.isMachineOpcode())
2088     return false;
2089   switch (Src.getMachineOpcode()) {
2090   case AMDGPU::FNEG_R600:
2091     if (!Neg.getNode())
2092       return false;
2093     Src = Src.getOperand(0);
2094     Neg = DAG.getTargetConstant(1, MVT::i32);
2095     return true;
2096   case AMDGPU::FABS_R600:
2097     if (!Abs.getNode())
2098       return false;
2099     Src = Src.getOperand(0);
2100     Abs = DAG.getTargetConstant(1, MVT::i32);
2101     return true;
2102   case AMDGPU::CONST_COPY: {
2103     unsigned Opcode = ParentNode->getMachineOpcode();
2104     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2105
2106     if (!Sel.getNode())
2107       return false;
2108
2109     SDValue CstOffset = Src.getOperand(0);
2110     if (ParentNode->getValueType(0).isVector())
2111       return false;
2112
2113     // Gather constants values
2114     int SrcIndices[] = {
2115       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2116       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2117       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2118       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2119       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2120       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2121       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2122       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2123       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2124       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2125       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2126     };
2127     std::vector<unsigned> Consts;
2128     for (int OtherSrcIdx : SrcIndices) {
2129       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2130       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2131         continue;
2132       if (HasDst) {
2133         OtherSrcIdx--;
2134         OtherSelIdx--;
2135       }
2136       if (RegisterSDNode *Reg =
2137           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2138         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2139           ConstantSDNode *Cst
2140             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2141           Consts.push_back(Cst->getZExtValue());
2142         }
2143       }
2144     }
2145
2146     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2147     Consts.push_back(Cst->getZExtValue());
2148     if (!TII->fitsConstReadLimitations(Consts)) {
2149       return false;
2150     }
2151
2152     Sel = CstOffset;
2153     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2154     return true;
2155   }
2156   case AMDGPU::MOV_IMM_I32:
2157   case AMDGPU::MOV_IMM_F32: {
2158     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2159     uint64_t ImmValue = 0;
2160
2161
2162     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2163       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2164       float FloatValue = FPC->getValueAPF().convertToFloat();
2165       if (FloatValue == 0.0) {
2166         ImmReg = AMDGPU::ZERO;
2167       } else if (FloatValue == 0.5) {
2168         ImmReg = AMDGPU::HALF;
2169       } else if (FloatValue == 1.0) {
2170         ImmReg = AMDGPU::ONE;
2171       } else {
2172         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2173       }
2174     } else {
2175       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2176       uint64_t Value = C->getZExtValue();
2177       if (Value == 0) {
2178         ImmReg = AMDGPU::ZERO;
2179       } else if (Value == 1) {
2180         ImmReg = AMDGPU::ONE_INT;
2181       } else {
2182         ImmValue = Value;
2183       }
2184     }
2185
2186     // Check that we aren't already using an immediate.
2187     // XXX: It's possible for an instruction to have more than one
2188     // immediate operand, but this is not supported yet.
2189     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2190       if (!Imm.getNode())
2191         return false;
2192       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2193       assert(C);
2194       if (C->getZExtValue())
2195         return false;
2196       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2197     }
2198     Src = DAG.getRegister(ImmReg, MVT::i32);
2199     return true;
2200   }
2201   default:
2202     return false;
2203   }
2204 }
2205
2206
2207 /// \brief Fold the instructions after selecting them
2208 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2209                                             SelectionDAG &DAG) const {
2210   const R600InstrInfo *TII =
2211       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2212   if (!Node->isMachineOpcode())
2213     return Node;
2214   unsigned Opcode = Node->getMachineOpcode();
2215   SDValue FakeOp;
2216
2217   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2218
2219   if (Opcode == AMDGPU::DOT_4) {
2220     int OperandIdx[] = {
2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2222       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2223       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2224       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2225       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2226       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2227       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2228       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2229         };
2230     int NegIdx[] = {
2231       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2232       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2233       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2234       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2235       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2236       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2237       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2238       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2239     };
2240     int AbsIdx[] = {
2241       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2242       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2243       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2244       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2245       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2246       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2247       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2248       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2249     };
2250     for (unsigned i = 0; i < 8; i++) {
2251       if (OperandIdx[i] < 0)
2252         return Node;
2253       SDValue &Src = Ops[OperandIdx[i] - 1];
2254       SDValue &Neg = Ops[NegIdx[i] - 1];
2255       SDValue &Abs = Ops[AbsIdx[i] - 1];
2256       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2257       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2258       if (HasDst)
2259         SelIdx--;
2260       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2261       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2262         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2263     }
2264   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2265     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2266       SDValue &Src = Ops[i];
2267       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2268         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2269     }
2270   } else if (Opcode == AMDGPU::CLAMP_R600) {
2271     SDValue Src = Node->getOperand(0);
2272     if (!Src.isMachineOpcode() ||
2273         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2274       return Node;
2275     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2276         AMDGPU::OpName::clamp);
2277     if (ClampIdx < 0)
2278       return Node;
2279     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2280     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2281     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2282         Node->getVTList(), Ops);
2283   } else {
2284     if (!TII->hasInstrModifiers(Opcode))
2285       return Node;
2286     int OperandIdx[] = {
2287       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2288       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2289       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2290     };
2291     int NegIdx[] = {
2292       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2293       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2294       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2295     };
2296     int AbsIdx[] = {
2297       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2298       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2299       -1
2300     };
2301     for (unsigned i = 0; i < 3; i++) {
2302       if (OperandIdx[i] < 0)
2303         return Node;
2304       SDValue &Src = Ops[OperandIdx[i] - 1];
2305       SDValue &Neg = Ops[NegIdx[i] - 1];
2306       SDValue FakeAbs;
2307       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2308       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2309       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2310       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2311       if (HasDst) {
2312         SelIdx--;
2313         ImmIdx--;
2314       }
2315       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2316       SDValue &Imm = Ops[ImmIdx];
2317       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2318         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2319     }
2320   }
2321
2322   return Node;
2323 }