lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/CodeGen/CallingConvLower.h"
  23 #include "llvm/CodeGen/MachineFrameInfo.h"
  24 #include "llvm/CodeGen/MachineInstrBuilder.h"
  25 #include "llvm/CodeGen/MachineRegisterInfo.h"
  26 #include "llvm/CodeGen/SelectionDAG.h"
  27 #include "llvm/IR/Argument.h"
  28 #include "llvm/IR/Function.h"
  29
  30 using namespace llvm;
  31
   32 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   33     AMDGPUTargetLowering(TM),
   34     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
        // Configures instruction selection lowering for R600: registers the value
        // types that live in R600 register classes, then records the per-type
        // legalization action (Expand / Custom / Legal) for condition codes,
        // operations, extending loads and truncating stores, the DAG combines
        // this target asks for, the boolean representation and the scheduling
        // preference. Gen caches the subtarget generation.

        // 32-bit scalars use Reg32, 2-element vectors Reg64, 4-element vectors
        // Reg128.
   35   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
   36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   37   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
   38   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
   39   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
   40   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
   41
        // Called after all addRegisterClass() calls above.
   42   computeRegisterProperties();
   43
   44   // Set condition code actions: the f32 and i32 comparison codes listed
        // here are all marked Expand.
   45   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
   46   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
   47   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
   48   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
   49   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
   50   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
   51   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
   52   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
   53   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
   54   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
   55   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
   56   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
   57
   58   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
   59   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
   60   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
   61   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
   62
        // FCOS / FSIN are handled by LowerOperation(), which calls LowerTrig().
   63   setOperationAction(ISD::FCOS, MVT::f32, Custom);
   64   setOperationAction(ISD::FSIN, MVT::f32, Custom);
   65
   66   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
   67   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
   68
   69   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   70   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
   71   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   72
   73   setOperationAction(ISD::FSUB, MVT::f32, Expand);
   74
        // Intrinsics are dispatched on their intrinsic ID in LowerOperation().
   75   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   76   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
   78
   79   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   80   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   81
   82   setOperationAction(ISD::SETCC, MVT::i32, Expand);
   83   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   84   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
   85
   86   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   87   setOperationAction(ISD::SELECT, MVT::f32, Expand);
   88   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
   89   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
   90
   91   // Expand sign extension of vectors
        // The scalar i1 / i8 / i16 forms are expanded only when
        // Subtarget->hasBFE() is false; the vector forms are always expanded.
   92   if (!Subtarget->hasBFE())
   93     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
   94
   95   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
   96   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
   97
   98   if (!Subtarget->hasBFE())
   99     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  100   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  101   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
  102
  103   if (!Subtarget->hasBFE())
  104     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  105   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  106   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
  107
  108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  110   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
  111
  112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
  113
  114
  115   // Legalize loads and stores to the private address space.
  116   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  117   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  118   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  119
  120   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  121   // spaces, so it is custom lowered to handle those where it isn't.
  122   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  123   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  124   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  125   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  126   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  127   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
  128
  129   setOperationAction(ISD::STORE, MVT::i8, Custom);
  130   setOperationAction(ISD::STORE, MVT::i32, Custom);
  131   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  132   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  133   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  134   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  135
        // NOTE(review): LOAD for i32 and v4i32 was already set to Custom above;
        // the next two calls repeat that setting and look redundant.
  136   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  137   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  138   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  139
        // Element extract / insert on 2- and 4-element vectors goes through
        // LowerEXTRACT_VECTOR_ELT / LowerINSERT_VECTOR_ELT via LowerOperation().
  140   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  141   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  142   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  143   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
  144
  145   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  146   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  147   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  148   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
  149
        // Node kinds for which this target requests target-specific DAG combines.
  150   setTargetDAGCombine(ISD::FP_ROUND);
  151   setTargetDAGCombine(ISD::FP_TO_SINT);
  152   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  153   setTargetDAGCombine(ISD::SELECT_CC);
  154   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  155
  156   setOperationAction(ISD::SUB, MVT::i64, Expand);
  157
  158   // These should be replaced by UDIVREM, but it does not happen automatically
  159   // during Type Legalization
  160   setOperationAction(ISD::UDIV, MVT::i64, Custom);
  161   setOperationAction(ISD::UREM, MVT::i64, Custom);
  162   setOperationAction(ISD::SDIV, MVT::i64, Custom);
  163   setOperationAction(ISD::SREM, MVT::i64, Custom);
  164
  165   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  166   //  to be Legal/Custom in order to avoid library calls.
  167   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  168   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  169   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  170
  171   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  172
        // ADDC / SUBC / ADDE / SUBE are expanded for both i32 and i64.
  173   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  174   for (MVT VT : ScalarIntVTs) {
  175     setOperationAction(ISD::ADDC, VT, Expand);
  176     setOperationAction(ISD::SUBC, VT, Expand);
  177     setOperationAction(ISD::ADDE, VT, Expand);
  178     setOperationAction(ISD::SUBE, VT, Expand);
  179   }
  180
        // Scalar and vector booleans are both represented as 0 or -1.
  181   setBooleanContents(ZeroOrNegativeOneBooleanContent);
  182   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  183   setSchedulingPreference(Sched::Source);
  184 }
 185
  186 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  187     MachineInstr * MI, MachineBasicBlock * BB) const {
        // Rewrites instruction MI, selected by its opcode, into its final form.
        // Replacement instructions are built at MI's position in BB. Cases that
        // break out of the switch have MI erased at the end of the function;
        // cases that must keep MI return BB directly. BB is always the block
        // returned, except in the default case, which returns whatever
        // AMDGPUTargetLowering::EmitInstrWithCustomInserter() returns for opcodes
        // that are not LDS_*_RET instructions.
  188   MachineFunction * MF = BB->getParent();
  189   MachineRegisterInfo &MRI = MF->getRegInfo();
  190   MachineBasicBlock::iterator I = *MI;
  191   const R600InstrInfo *TII =
  192     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
  193
  194   switch (MI->getOpcode()) {
  195   default:
  196     // Replace LDS_*_RET instructions that don't have any uses with the
  197     // equivalent LDS_*_NORET instruction.
  198     if (TII->isLDSRetInstr(MI->getOpcode())) {
  199       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  200       assert(DstIdx != -1);
  201       MachineInstrBuilder NewMI;
            // The result is still used somewhere: keep the RET form untouched.
  202       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
  203         return BB;
  204
  205       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
  206                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
            // Copy every operand except operand 0 onto the NORET instruction.
  207       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
  208         NewMI.addOperand(MI->getOperand(i));
  209       }
  210     } else {
            // Not an LDS_*_RET instruction: defer to the base class.
  211       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  212     }
  213     break;
        // CLAMP / FABS / FNEG: emit a MOV from operand 1 into operand 0 and
        // attach the matching MO_FLAG_* to it with addFlag().
  214   case AMDGPU::CLAMP_R600: {
  215     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
  216                                                    AMDGPU::MOV,
  217                                                    MI->getOperand(0).getReg(),
  218                                                    MI->getOperand(1).getReg());
  219     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
  220     break;
  221   }
  222
  223   case AMDGPU::FABS_R600: {
  224     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
  225                                                     AMDGPU::MOV,
  226                                                     MI->getOperand(0).getReg(),
  227                                                     MI->getOperand(1).getReg());
  228     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
  229     break;
  230   }
  231
  232   case AMDGPU::FNEG_R600: {
  233     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
  234                                                     AMDGPU::MOV,
  235                                                     MI->getOperand(0).getReg(),
  236                                                     MI->getOperand(1).getReg());
  237     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
  238     break;
  239   }
  240
        // MASK_WRITE emits nothing new: it adds MO_FLAG_MASK to the instruction
        // that defines the (virtual) register in operand 0.
  241   case AMDGPU::MASK_WRITE: {
  242     unsigned maskedRegister = MI->getOperand(0).getReg();
  243     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
  244     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
  245     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
  246     break;
  247   }
  248
        // The float immediate is passed to buildMovImm() as its raw bit pattern.
  249   case AMDGPU::MOV_IMM_F32:
  250     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
  251                      MI->getOperand(1).getFPImm()->getValueAPF()
  252                          .bitcastToAPInt().getZExtValue());
  253     break;
  254   case AMDGPU::MOV_IMM_I32:
  255     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
  256                      MI->getOperand(1).getImm());
  257     break;
        // MOV from ALU_CONST into operand 0; the immediate in operand 1 is
        // written into the new instruction's src0_sel operand.
  258   case AMDGPU::CONST_COPY: {
  259     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
  260         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
  261     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
  262         MI->getOperand(1).getImm());
  263     break;
  264   }
  265
        // Re-emit the write with the same opcode and operands 0-1, adding an
        // immediate that is 1 when the next instruction is RETURN.
  266   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  267   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  268   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
          // NOTE(review): std::next(I) is dereferenced without comparing it to
          // BB->end(); this assumes MI is never the last instruction in BB.
  269     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
  270
  271     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
  272             .addOperand(MI->getOperand(0))
  273             .addOperand(MI->getOperand(1))
  274             .addImm(EOP); // Set End of program bit
  275     break;
  276   }
  277
        // TXD expands into three instructions: TEX_SET_GRADIENTS_H (fed by
        // operand 3, defining T0), TEX_SET_GRADIENTS_V (fed by operand 2, defining
        // T1) and TEX_SAMPLE_G (operands 0 and 1), which lists T0 and T1 as
        // implicit uses. All three share the same immediates and the RID / SID
        // operands (operands 4 and 5).
  278   case AMDGPU::TXD: {
  279     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
  280     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
  281     MachineOperand &RID = MI->getOperand(4);
  282     MachineOperand &SID = MI->getOperand(5);
  283     unsigned TextureId = MI->getOperand(6).getImm();
          // Defaults: Src{X,Y,Z,W} = 0..3 and CT{X,Y,Z,W} = 1. The texture type
          // below overrides some of them. Presumably Src* are source channel
          // selects and CT* coordinate-type bits -- TODO confirm against the
          // instruction definitions.
  284     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
  285     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
  286
  287     switch (TextureId) {
  288     case 5: // Rect
  289       CTX = CTY = 0;
  290       break;
  291     case 6: // Shadow1D
  292       SrcW = SrcZ;
  293       break;
  294     case 7: // Shadow2D
  295       SrcW = SrcZ;
  296       break;
  297     case 8: // ShadowRect
  298       CTX = CTY = 0;
  299       SrcW = SrcZ;
  300       break;
  301     case 9: // 1DArray
  302       SrcZ = SrcY;
  303       CTZ = 0;
  304       break;
  305     case 10: // 2DArray
  306       CTZ = 0;
  307       break;
  308     case 11: // Shadow1DArray
  309       SrcZ = SrcY;
  310       CTZ = 0;
  311       break;
  312     case 12: // Shadow2DArray
  313       CTZ = 0;
  314       break;
  315     }
  316     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
  317             .addOperand(MI->getOperand(3))
  318             .addImm(SrcX)
  319             .addImm(SrcY)
  320             .addImm(SrcZ)
  321             .addImm(SrcW)
  322             .addImm(0)
  323             .addImm(0)
  324             .addImm(0)
  325             .addImm(0)
  326             .addImm(1)
  327             .addImm(2)
  328             .addImm(3)
  329             .addOperand(RID)
  330             .addOperand(SID)
  331             .addImm(CTX)
  332             .addImm(CTY)
  333             .addImm(CTZ)
  334             .addImm(CTW);
  335     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
  336             .addOperand(MI->getOperand(2))
  337             .addImm(SrcX)
  338             .addImm(SrcY)
  339             .addImm(SrcZ)
  340             .addImm(SrcW)
  341             .addImm(0)
  342             .addImm(0)
  343             .addImm(0)
  344             .addImm(0)
  345             .addImm(1)
  346             .addImm(2)
  347             .addImm(3)
  348             .addOperand(RID)
  349             .addOperand(SID)
  350             .addImm(CTX)
  351             .addImm(CTY)
  352             .addImm(CTZ)
  353             .addImm(CTW);
  354     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
  355             .addOperand(MI->getOperand(0))
  356             .addOperand(MI->getOperand(1))
  357             .addImm(SrcX)
  358             .addImm(SrcY)
  359             .addImm(SrcZ)
  360             .addImm(SrcW)
  361             .addImm(0)
  362             .addImm(0)
  363             .addImm(0)
  364             .addImm(0)
  365             .addImm(1)
  366             .addImm(2)
  367             .addImm(3)
  368             .addOperand(RID)
  369             .addOperand(SID)
  370             .addImm(CTX)
  371             .addImm(CTY)
  372             .addImm(CTZ)
  373             .addImm(CTW)
  374             .addReg(T0, RegState::Implicit)
  375             .addReg(T1, RegState::Implicit);
  376     break;
  377   }
  378
        // Same three-instruction expansion as TXD above, except that the final
        // sample uses TEX_SAMPLE_C_G instead of TEX_SAMPLE_G.
  379   case AMDGPU::TXD_SHADOW: {
  380     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
  381     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
  382     MachineOperand &RID = MI->getOperand(4);
  383     MachineOperand &SID = MI->getOperand(5);
  384     unsigned TextureId = MI->getOperand(6).getImm();
  385     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
  386     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
  387
  388     switch (TextureId) {
  389     case 5: // Rect
  390       CTX = CTY = 0;
  391       break;
  392     case 6: // Shadow1D
  393       SrcW = SrcZ;
  394       break;
  395     case 7: // Shadow2D
  396       SrcW = SrcZ;
  397       break;
  398     case 8: // ShadowRect
  399       CTX = CTY = 0;
  400       SrcW = SrcZ;
  401       break;
  402     case 9: // 1DArray
  403       SrcZ = SrcY;
  404       CTZ = 0;
  405       break;
  406     case 10: // 2DArray
  407       CTZ = 0;
  408       break;
  409     case 11: // Shadow1DArray
  410       SrcZ = SrcY;
  411       CTZ = 0;
  412       break;
  413     case 12: // Shadow2DArray
  414       CTZ = 0;
  415       break;
  416     }
  417
  418     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
  419             .addOperand(MI->getOperand(3))
  420             .addImm(SrcX)
  421             .addImm(SrcY)
  422             .addImm(SrcZ)
  423             .addImm(SrcW)
  424             .addImm(0)
  425             .addImm(0)
  426             .addImm(0)
  427             .addImm(0)
  428             .addImm(1)
  429             .addImm(2)
  430             .addImm(3)
  431             .addOperand(RID)
  432             .addOperand(SID)
  433             .addImm(CTX)
  434             .addImm(CTY)
  435             .addImm(CTZ)
  436             .addImm(CTW);
  437     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
  438             .addOperand(MI->getOperand(2))
  439             .addImm(SrcX)
  440             .addImm(SrcY)
  441             .addImm(SrcZ)
  442             .addImm(SrcW)
  443             .addImm(0)
  444             .addImm(0)
  445             .addImm(0)
  446             .addImm(0)
  447             .addImm(1)
  448             .addImm(2)
  449             .addImm(3)
  450             .addOperand(RID)
  451             .addOperand(SID)
  452             .addImm(CTX)
  453             .addImm(CTY)
  454             .addImm(CTZ)
  455             .addImm(CTW);
  456     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
  457             .addOperand(MI->getOperand(0))
  458             .addOperand(MI->getOperand(1))
  459             .addImm(SrcX)
  460             .addImm(SrcY)
  461             .addImm(SrcZ)
  462             .addImm(SrcW)
  463             .addImm(0)
  464             .addImm(0)
  465             .addImm(0)
  466             .addImm(0)
  467             .addImm(1)
  468             .addImm(2)
  469             .addImm(3)
  470             .addOperand(RID)
  471             .addOperand(SID)
  472             .addImm(CTX)
  473             .addImm(CTY)
  474             .addImm(CTZ)
  475             .addImm(CTW)
  476             .addReg(T0, RegState::Implicit)
  477             .addReg(T1, RegState::Implicit);
  478     break;
  479   }
  480
        // BRANCH becomes a JUMP that takes operand 0.
  481   case AMDGPU::BRANCH:
  482       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
  483               .addOperand(MI->getOperand(0));
  484       break;
  485
        // Conditional branches: PRED_X defines PREDICATE_BIT from operand 1 using
        // OPCODE_IS_NOT_ZERO (f32) or OPCODE_IS_NOT_ZERO_INT (i32) and gets
        // MO_FLAG_PUSH; JUMP_COND then takes operand 0 and kills PREDICATE_BIT.
  486   case AMDGPU::BRANCH_COND_f32: {
  487     MachineInstr *NewMI =
  488       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
  489               AMDGPU::PREDICATE_BIT)
  490               .addOperand(MI->getOperand(1))
  491               .addImm(OPCODE_IS_NOT_ZERO)
  492               .addImm(0); // Flags
  493     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
  494     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
  495             .addOperand(MI->getOperand(0))
  496             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
  497     break;
  498   }
  499
  500   case AMDGPU::BRANCH_COND_i32: {
  501     MachineInstr *NewMI =
  502       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
  503             AMDGPU::PREDICATE_BIT)
  504             .addOperand(MI->getOperand(1))
  505             .addImm(OPCODE_IS_NOT_ZERO_INT)
  506             .addImm(0); // Flags
  507     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
  508     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
  509            .addOperand(MI->getOperand(0))
  510             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
  511     break;
  512   }
  513
  514   case AMDGPU::EG_ExportSwz:
  515   case AMDGPU::R600_ExportSwz: {
  516     // Instruction is left unmodified if it's not the last one of its type
          // (type = immediate in operand 1) and is not directly followed by
          // RETURN. The scan covers the rest of BB only.
  517     bool isLastInstructionOfItsType = true;
  518     unsigned InstExportType = MI->getOperand(1).getImm();
  519     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
  520          EndBlock = BB->end(); NextExportInst != EndBlock;
  521          NextExportInst = std::next(NextExportInst)) {
  522       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
  523           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
  524         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
  525             .getImm();
  526         if (CurrentInstExportType == InstExportType) {
  527           isLastInstructionOfItsType = false;
  528           break;
  529         }
  530       }
  531     }
          // NOTE(review): std::next(I) is dereferenced without comparing it to
          // BB->end(); this assumes MI is never the last instruction in BB.
  532     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
  533     if (!EOP && !isLastInstructionOfItsType)
  534       return BB;
          // NOTE(review): 84 (EG_ExportSwz) and 40 (R600_ExportSwz) are magic
          // values, presumably CF instruction encodings -- confirm against the
          // ISA documentation.
  535     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
          // Re-emit the export with operands 0-6 plus the CfInst and EOP
          // immediates.
  536     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
  537             .addOperand(MI->getOperand(0))
  538             .addOperand(MI->getOperand(1))
  539             .addOperand(MI->getOperand(2))
  540             .addOperand(MI->getOperand(3))
  541             .addOperand(MI->getOperand(4))
  542             .addOperand(MI->getOperand(5))
  543             .addOperand(MI->getOperand(6))
  544             .addImm(CfInst)
  545             .addImm(EOP);
  546     break;
  547   }
  548   case AMDGPU::RETURN: {
  549     // RETURN instructions must have the live-out registers as implicit uses,
  550     // otherwise they appear dead.
  551     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
  552     MachineInstrBuilder MIB(*MF, MI);
  553     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
  554       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
          // MI itself is kept, so return before the eraseFromParent() below.
  555     return BB;
  556   }
  557   }
  558
        // Every case that broke out of the switch is finished with MI.
  559   MI->eraseFromParent();
  560   return BB;
  561 }
 562
 563 //===----------------------------------------------------------------------===//
 564 // Custom DAG Lowering Operations
 565 //===----------------------------------------------------------------------===//
 566
 567 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 568   MachineFunction &MF = DAG.getMachineFunction();
 569   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 570   switch (Op.getOpcode()) {
 571   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 572   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 573   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 574   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 575   case ISD::SRA_PARTS:
 576   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 577   case ISD::FCOS:
 578   case ISD::FSIN: return LowerTrig(Op, DAG);
 579   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 580   case ISD::STORE: return LowerSTORE(Op, DAG);
 581   case ISD::LOAD: return LowerLOAD(Op, DAG);
 582   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 583   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 584   case ISD::INTRINSIC_VOID: {
 585     SDValue Chain = Op.getOperand(0);
 586     unsigned IntrinsicID =
 587                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 588     switch (IntrinsicID) {
 589     case AMDGPUIntrinsic::AMDGPU_store_output: {
 590       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 591       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 592       MFI->LiveOuts.push_back(Reg);
 593       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 594     }
 595     case AMDGPUIntrinsic::R600_store_swizzle: {
 596       const SDValue Args[8] = {
 597         Chain,
 598         Op.getOperand(2), // Export Value
 599         Op.getOperand(3), // ArrayBase
 600         Op.getOperand(4), // Type
 601         DAG.getConstant(0, MVT::i32), // SWZ_X
 602         DAG.getConstant(1, MVT::i32), // SWZ_Y
 603         DAG.getConstant(2, MVT::i32), // SWZ_Z
 604         DAG.getConstant(3, MVT::i32) // SWZ_W
 605       };
 606       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 607     }
 608
 609     // default for switch(IntrinsicID)
 610     default: break;
 611     }
 612     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 613     break;
 614   }
 615   case ISD::INTRINSIC_WO_CHAIN: {
 616     unsigned IntrinsicID =
 617                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 618     EVT VT = Op.getValueType();
 619     SDLoc DL(Op);
 620     switch(IntrinsicID) {
 621     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 622     case AMDGPUIntrinsic::R600_load_input: {
 623       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 624       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 625       MachineFunction &MF = DAG.getMachineFunction();
 626       MachineRegisterInfo &MRI = MF.getRegInfo();
 627       MRI.addLiveIn(Reg);
 628       return DAG.getCopyFromReg(DAG.getEntryNode(),
 629           SDLoc(DAG.getEntryNode()), Reg, VT);
 630     }
 631
 632     case AMDGPUIntrinsic::R600_interp_input: {
 633       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 634       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 635       MachineSDNode *interp;
 636       if (ijb < 0) {
 637         const MachineFunction &MF = DAG.getMachineFunction();
 638         const R600InstrInfo *TII =
 639           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 640         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 641             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 642         return DAG.getTargetExtractSubreg(
 643             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 644             DL, MVT::f32, SDValue(interp, 0));
 645       }
 646       MachineFunction &MF = DAG.getMachineFunction();
 647       MachineRegisterInfo &MRI = MF.getRegInfo();
 648       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 649       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 650       MRI.addLiveIn(RegisterI);
 651       MRI.addLiveIn(RegisterJ);
 652       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 653           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 654       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 655           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 656
 657       if (slot % 4 < 2)
 658         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 659             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 660             RegisterJNode, RegisterINode);
 661       else
 662         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 663             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 664             RegisterJNode, RegisterINode);
 665       return SDValue(interp, slot % 2);
 666     }
 667     case AMDGPUIntrinsic::R600_interp_xy:
 668     case AMDGPUIntrinsic::R600_interp_zw: {
 669       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 670       MachineSDNode *interp;
 671       SDValue RegisterINode = Op.getOperand(2);
 672       SDValue RegisterJNode = Op.getOperand(3);
 673
 674       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 675         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 676             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 677             RegisterJNode, RegisterINode);
 678       else
 679         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 680             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 681             RegisterJNode, RegisterINode);
 682       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 683           SDValue(interp, 0), SDValue(interp, 1));
 684     }
 685     case AMDGPUIntrinsic::R600_tex:
 686     case AMDGPUIntrinsic::R600_texc:
 687     case AMDGPUIntrinsic::R600_txl:
 688     case AMDGPUIntrinsic::R600_txlc:
 689     case AMDGPUIntrinsic::R600_txb:
 690     case AMDGPUIntrinsic::R600_txbc:
 691     case AMDGPUIntrinsic::R600_txf:
 692     case AMDGPUIntrinsic::R600_txq:
 693     case AMDGPUIntrinsic::R600_ddx:
 694     case AMDGPUIntrinsic::R600_ddy:
 695     case AMDGPUIntrinsic::R600_ldptr: {
 696       unsigned TextureOp;
 697       switch (IntrinsicID) {
 698       case AMDGPUIntrinsic::R600_tex:
 699         TextureOp = 0;
 700         break;
 701       case AMDGPUIntrinsic::R600_texc:
 702         TextureOp = 1;
 703         break;
 704       case AMDGPUIntrinsic::R600_txl:
 705         TextureOp = 2;
 706         break;
 707       case AMDGPUIntrinsic::R600_txlc:
 708         TextureOp = 3;
 709         break;
 710       case AMDGPUIntrinsic::R600_txb:
 711         TextureOp = 4;
 712         break;
 713       case AMDGPUIntrinsic::R600_txbc:
 714         TextureOp = 5;
 715         break;
 716       case AMDGPUIntrinsic::R600_txf:
 717         TextureOp = 6;
 718         break;
 719       case AMDGPUIntrinsic::R600_txq:
 720         TextureOp = 7;
 721         break;
 722       case AMDGPUIntrinsic::R600_ddx:
 723         TextureOp = 8;
 724         break;
 725       case AMDGPUIntrinsic::R600_ddy:
 726         TextureOp = 9;
 727         break;
 728       case AMDGPUIntrinsic::R600_ldptr:
 729         TextureOp = 10;
 730         break;
 731       default:
 732         llvm_unreachable("Unknow Texture Operation");
 733       }
 734
 735       SDValue TexArgs[19] = {
 736         DAG.getConstant(TextureOp, MVT::i32),
 737         Op.getOperand(1),
 738         DAG.getConstant(0, MVT::i32),
 739         DAG.getConstant(1, MVT::i32),
 740         DAG.getConstant(2, MVT::i32),
 741         DAG.getConstant(3, MVT::i32),
 742         Op.getOperand(2),
 743         Op.getOperand(3),
 744         Op.getOperand(4),
 745         DAG.getConstant(0, MVT::i32),
 746         DAG.getConstant(1, MVT::i32),
 747         DAG.getConstant(2, MVT::i32),
 748         DAG.getConstant(3, MVT::i32),
 749         Op.getOperand(5),
 750         Op.getOperand(6),
 751         Op.getOperand(7),
 752         Op.getOperand(8),
 753         Op.getOperand(9),
 754         Op.getOperand(10)
 755       };
 756       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 757     }
 758     case AMDGPUIntrinsic::AMDGPU_dp4: {
 759       SDValue Args[8] = {
 760       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 761           DAG.getConstant(0, MVT::i32)),
 762       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 763           DAG.getConstant(0, MVT::i32)),
 764       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 765           DAG.getConstant(1, MVT::i32)),
 766       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 767           DAG.getConstant(1, MVT::i32)),
 768       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 769           DAG.getConstant(2, MVT::i32)),
 770       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 771           DAG.getConstant(2, MVT::i32)),
 772       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 773           DAG.getConstant(3, MVT::i32)),
 774       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 775           DAG.getConstant(3, MVT::i32))
 776       };
 777       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 778     }
 779
 780     case Intrinsic::r600_read_ngroups_x:
 781       return LowerImplicitParameter(DAG, VT, DL, 0);
 782     case Intrinsic::r600_read_ngroups_y:
 783       return LowerImplicitParameter(DAG, VT, DL, 1);
 784     case Intrinsic::r600_read_ngroups_z:
 785       return LowerImplicitParameter(DAG, VT, DL, 2);
 786     case Intrinsic::r600_read_global_size_x:
 787       return LowerImplicitParameter(DAG, VT, DL, 3);
 788     case Intrinsic::r600_read_global_size_y:
 789       return LowerImplicitParameter(DAG, VT, DL, 4);
 790     case Intrinsic::r600_read_global_size_z:
 791       return LowerImplicitParameter(DAG, VT, DL, 5);
 792     case Intrinsic::r600_read_local_size_x:
 793       return LowerImplicitParameter(DAG, VT, DL, 6);
 794     case Intrinsic::r600_read_local_size_y:
 795       return LowerImplicitParameter(DAG, VT, DL, 7);
 796     case Intrinsic::r600_read_local_size_z:
 797       return LowerImplicitParameter(DAG, VT, DL, 8);
 798
 799     case Intrinsic::r600_read_tgid_x:
 800       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 801                                   AMDGPU::T1_X, VT);
 802     case Intrinsic::r600_read_tgid_y:
 803       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 804                                   AMDGPU::T1_Y, VT);
 805     case Intrinsic::r600_read_tgid_z:
 806       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 807                                   AMDGPU::T1_Z, VT);
 808     case Intrinsic::r600_read_tidig_x:
 809       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 810                                   AMDGPU::T0_X, VT);
 811     case Intrinsic::r600_read_tidig_y:
 812       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 813                                   AMDGPU::T0_Y, VT);
 814     case Intrinsic::r600_read_tidig_z:
 815       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 816                                   AMDGPU::T0_Z, VT);
 817     case Intrinsic::AMDGPU_rsq:
 818       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 819       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 820     }
 821     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 822     break;
 823   }
 824   } // end switch(Op.getOpcode())
 825   return SDValue();
 826 }
 827
 828 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 829                                             SmallVectorImpl<SDValue> &Results,
 830                                             SelectionDAG &DAG) const {
 831   switch (N->getOpcode()) {
 832   default:
 833     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 834     return;
 835   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 836     return;
 837   case ISD::LOAD: {
 838     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 839     Results.push_back(SDValue(Node, 0));
 840     Results.push_back(SDValue(Node, 1));
 841     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 842     // function
 843     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 844     return;
 845   }
 846   case ISD::STORE: {
 847     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 848     Results.push_back(SDValue(Node, 0));
 849     return;
 850   }
 851   case ISD::UDIV: {
 852     SDValue Op = SDValue(N, 0);
 853     SDLoc DL(Op);
 854     EVT VT = Op.getValueType();
 855     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 856       N->getOperand(0), N->getOperand(1));
 857     Results.push_back(UDIVREM);
 858     break;
 859   }
 860   case ISD::UREM: {
 861     SDValue Op = SDValue(N, 0);
 862     SDLoc DL(Op);
 863     EVT VT = Op.getValueType();
 864     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 865       N->getOperand(0), N->getOperand(1));
 866     Results.push_back(UDIVREM.getValue(1));
 867     break;
 868   }
 869   case ISD::SDIV: {
 870     SDValue Op = SDValue(N, 0);
 871     SDLoc DL(Op);
 872     EVT VT = Op.getValueType();
 873     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 874       N->getOperand(0), N->getOperand(1));
 875     Results.push_back(SDIVREM);
 876     break;
 877   }
 878   case ISD::SREM: {
 879     SDValue Op = SDValue(N, 0);
 880     SDLoc DL(Op);
 881     EVT VT = Op.getValueType();
 882     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 883       N->getOperand(0), N->getOperand(1));
 884     Results.push_back(SDIVREM.getValue(1));
 885     break;
 886   }
 887   case ISD::SDIVREM: {
 888     SDValue Op = SDValue(N, 1);
 889     SDValue RES = LowerSDIVREM(Op, DAG);
 890     Results.push_back(RES);
 891     Results.push_back(RES.getValue(1));
 892     break;
 893   }
 894   case ISD::UDIVREM: {
 895     SDValue Op = SDValue(N, 0);
 896     SDLoc DL(Op);
 897     EVT VT = Op.getValueType();
 898     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
 899
 900     SDValue one = DAG.getConstant(1, HalfVT);
 901     SDValue zero = DAG.getConstant(0, HalfVT);
 902
 903     //HiLo split
 904     SDValue LHS = N->getOperand(0);
 905     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
 906     SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
 907
 908     SDValue RHS = N->getOperand(1);
 909     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
 910     SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 911
 912     // Get Speculative values
 913     SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
 914     SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 915
 916     SDValue REM_Hi = zero;
 917     SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
 918
 919     SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
 920     SDValue DIV_Lo = zero;
 921
 922     const unsigned halfBitWidth = HalfVT.getSizeInBits();
 923
 924     for (unsigned i = 0; i < halfBitWidth; ++i) {
 925       SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
 926       // Get Value of high bit
 927       SDValue HBit;
 928       if (halfBitWidth == 32 && Subtarget->hasBFE()) {
 929         HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
 930       } else {
 931         HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
 932         HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
 933       }
 934
 935       SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
 936         DAG.getConstant(halfBitWidth - 1, HalfVT));
 937       REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
 938       REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
 939
 940       REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
 941       REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
 942
 943
 944       SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 945
 946       SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
 947       SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
 948
 949       DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 950
 951       // Update REM
 952
 953       SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
 954
 955       REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
 956       REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
 957       REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
 958     }
 959
 960     SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 961     SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
 962     Results.push_back(DIV);
 963     Results.push_back(REM);
 964     break;
 965   }
 966   }
 967 }
 968
 969 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 970                                                    SDValue Vector) const {
 971
 972   SDLoc DL(Vector);
 973   EVT VecVT = Vector.getValueType();
 974   EVT EltVT = VecVT.getVectorElementType();
 975   SmallVector<SDValue, 8> Args;
 976
 977   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 978                                                            i != e; ++i) {
 979     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 980                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 981   }
 982
 983   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 984 }
 985
 986 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 987                                                     SelectionDAG &DAG) const {
 988
 989   SDLoc DL(Op);
 990   SDValue Vector = Op.getOperand(0);
 991   SDValue Index = Op.getOperand(1);
 992
 993   if (isa<ConstantSDNode>(Index) ||
 994       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 995     return Op;
 996
 997   Vector = vectorToVerticalVector(DAG, Vector);
 998   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 999                      Vector, Index);
1000 }
1001
1002 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
1003                                                    SelectionDAG &DAG) const {
1004   SDLoc DL(Op);
1005   SDValue Vector = Op.getOperand(0);
1006   SDValue Value = Op.getOperand(1);
1007   SDValue Index = Op.getOperand(2);
1008
1009   if (isa<ConstantSDNode>(Index) ||
1010       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1011     return Op;
1012
1013   Vector = vectorToVerticalVector(DAG, Vector);
1014   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
1015                                Vector, Value, Index);
1016   return vectorToVerticalVector(DAG, Insert);
1017 }
1018
1019 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
1020   // On hw >= R700, COS/SIN input must be between -1. and 1.
1021   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
1022   EVT VT = Op.getValueType();
1023   SDValue Arg = Op.getOperand(0);
1024   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
1025       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
1026         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
1027           DAG.getConstantFP(0.15915494309, MVT::f32)),
1028         DAG.getConstantFP(0.5, MVT::f32)));
1029   unsigned TrigNode;
1030   switch (Op.getOpcode()) {
1031   case ISD::FCOS:
1032     TrigNode = AMDGPUISD::COS_HW;
1033     break;
1034   case ISD::FSIN:
1035     TrigNode = AMDGPUISD::SIN_HW;
1036     break;
1037   default:
1038     llvm_unreachable("Wrong trig opcode");
1039   }
1040   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
1041       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
1042         DAG.getConstantFP(-0.5, MVT::f32)));
1043   if (Gen >= AMDGPUSubtarget::R700)
1044     return TrigVal;
1045   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1046   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1047       DAG.getConstantFP(3.14159265359, MVT::f32));
1048 }
1049
1050 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1051   SDLoc DL(Op);
1052   EVT VT = Op.getValueType();
1053
1054   SDValue Lo = Op.getOperand(0);
1055   SDValue Hi = Op.getOperand(1);
1056   SDValue Shift = Op.getOperand(2);
1057   SDValue Zero = DAG.getConstant(0, VT);
1058   SDValue One  = DAG.getConstant(1, VT);
1059
1060   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1061   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1062   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1063   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1064
1065   // The dance around Width1 is necessary for 0 special case.
1066   // Without it the CompShift might be 32, producing incorrect results in
1067   // Overflow. So we do the shift in two steps, the alternative is to
1068   // add a conditional to filter the special case.
1069
1070   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1071   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1072
1073   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1074   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1075   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1076
1077   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1078   SDValue LoBig = Zero;
1079
1080   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1081   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1082
1083   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1084 }
1085
1086 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1087   SDLoc DL(Op);
1088   EVT VT = Op.getValueType();
1089
1090   SDValue Lo = Op.getOperand(0);
1091   SDValue Hi = Op.getOperand(1);
1092   SDValue Shift = Op.getOperand(2);
1093   SDValue Zero = DAG.getConstant(0, VT);
1094   SDValue One  = DAG.getConstant(1, VT);
1095
1096   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1097
1098   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1099   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1100   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1101   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1102
1103   // The dance around Width1 is necessary for 0 special case.
1104   // Without it the CompShift might be 32, producing incorrect results in
1105   // Overflow. So we do the shift in two steps, the alternative is to
1106   // add a conditional to filter the special case.
1107
1108   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1109   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1110
1111   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1112   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1113   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1114
1115   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1116   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1117
1118   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1119   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1120
1121   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1122 }
1123
1124 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1125   return DAG.getNode(
1126       ISD::SETCC,
1127       SDLoc(Op),
1128       MVT::i1,
1129       Op, DAG.getConstantFP(0.0f, MVT::f32),
1130       DAG.getCondCode(ISD::SETNE)
1131       );
1132 }
1133
1134 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1135                                                    SDLoc DL,
1136                                                    unsigned DwordOffset) const {
1137   unsigned ByteOffset = DwordOffset * 4;
1138   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1139                                       AMDGPUAS::CONSTANT_BUFFER_0);
1140
1141   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1142   assert(isInt<16>(ByteOffset));
1143
1144   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1145                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1146                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1147                      false, false, false, 0);
1148 }
1149
1150 bool R600TargetLowering::isZero(SDValue Op) const {
1151   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1152     return Cst->isNullValue();
1153   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1154     return CstFP->isZero();
1155   } else {
1156     return false;
1157   }
1158 }
1159
1160 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1161   SDLoc DL(Op);
1162   EVT VT = Op.getValueType();
1163
1164   SDValue LHS = Op.getOperand(0);
1165   SDValue RHS = Op.getOperand(1);
1166   SDValue True = Op.getOperand(2);
1167   SDValue False = Op.getOperand(3);
1168   SDValue CC = Op.getOperand(4);
1169   SDValue Temp;
1170
1171   // LHS and RHS are guaranteed to be the same value type
1172   EVT CompareVT = LHS.getValueType();
1173
1174   // Check if we can lower this to a native operation.
1175
1176   // Try to lower to a SET* instruction:
1177   //
1178   // SET* can match the following patterns:
1179   //
1180   // select_cc f32, f32, -1,  0, cc_supported
1181   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1182   // select_cc i32, i32, -1,  0, cc_supported
1183   //
1184
1185   // Move hardware True/False values to the correct operand.
1186   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1187   ISD::CondCode InverseCC =
1188      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1189   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1190     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1191       std::swap(False, True);
1192       CC = DAG.getCondCode(InverseCC);
1193     } else {
1194       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1195       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1196         std::swap(False, True);
1197         std::swap(LHS, RHS);
1198         CC = DAG.getCondCode(SwapInvCC);
1199       }
1200     }
1201   }
1202
1203   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1204       (CompareVT == VT || VT == MVT::i32)) {
1205     // This can be matched by a SET* instruction.
1206     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1207   }
1208
1209   // Try to lower to a CND* instruction:
1210   //
1211   // CND* can match the following patterns:
1212   //
1213   // select_cc f32, 0.0, f32, f32, cc_supported
1214   // select_cc f32, 0.0, i32, i32, cc_supported
1215   // select_cc i32, 0,   f32, f32, cc_supported
1216   // select_cc i32, 0,   i32, i32, cc_supported
1217   //
1218
1219   // Try to move the zero value to the RHS
1220   if (isZero(LHS)) {
1221     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1222     // Try swapping the operands
1223     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1224     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1225       std::swap(LHS, RHS);
1226       CC = DAG.getCondCode(CCSwapped);
1227     } else {
1228       // Try inverting the conditon and then swapping the operands
1229       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1230       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1231       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1232         std::swap(True, False);
1233         std::swap(LHS, RHS);
1234         CC = DAG.getCondCode(CCSwapped);
1235       }
1236     }
1237   }
1238   if (isZero(RHS)) {
1239     SDValue Cond = LHS;
1240     SDValue Zero = RHS;
1241     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1242     if (CompareVT != VT) {
1243       // Bitcast True / False to the correct types.  This will end up being
1244       // a nop, but it allows us to define only a single pattern in the
1245       // .TD files for each CND* instruction rather than having to have
1246       // one pattern for integer True/False and one for fp True/False
1247       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1248       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1249     }
1250
1251     switch (CCOpcode) {
1252     case ISD::SETONE:
1253     case ISD::SETUNE:
1254     case ISD::SETNE:
1255       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1256       Temp = True;
1257       True = False;
1258       False = Temp;
1259       break;
1260     default:
1261       break;
1262     }
1263     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1264         Cond, Zero,
1265         True, False,
1266         DAG.getCondCode(CCOpcode));
1267     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1268   }
1269
1270   // If we make it this for it means we have no native instructions to handle
1271   // this SELECT_CC, so we must lower it.
1272   SDValue HWTrue, HWFalse;
1273
1274   if (CompareVT == MVT::f32) {
1275     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1276     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1277   } else if (CompareVT == MVT::i32) {
1278     HWTrue = DAG.getConstant(-1, CompareVT);
1279     HWFalse = DAG.getConstant(0, CompareVT);
1280   }
1281   else {
1282     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1283   }
1284
1285   // Lower this unsupported SELECT_CC into a combination of two supported
1286   // SELECT_CC operations.
1287   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1288
1289   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1290       Cond, HWFalse,
1291       True, False,
1292       DAG.getCondCode(ISD::SETNE));
1293 }
1294
1295 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1296 /// convert these pointers to a register index.  Each register holds
1297 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1298 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1299 /// for indirect addressing.
1300 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1301                                                unsigned StackWidth,
1302                                                SelectionDAG &DAG) const {
1303   unsigned SRLPad;
1304   switch(StackWidth) {
1305   case 1:
1306     SRLPad = 2;
1307     break;
1308   case 2:
1309     SRLPad = 3;
1310     break;
1311   case 4:
1312     SRLPad = 4;
1313     break;
1314   default: llvm_unreachable("Invalid stack width");
1315   }
1316
1317   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1318                      DAG.getConstant(SRLPad, MVT::i32));
1319 }
1320
1321 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1322                                          unsigned ElemIdx,
1323                                          unsigned &Channel,
1324                                          unsigned &PtrIncr) const {
1325   switch (StackWidth) {
1326   default:
1327   case 1:
1328     Channel = 0;
1329     if (ElemIdx > 0) {
1330       PtrIncr = 1;
1331     } else {
1332       PtrIncr = 0;
1333     }
1334     break;
1335   case 2:
1336     Channel = ElemIdx % 2;
1337     if (ElemIdx == 2) {
1338       PtrIncr = 1;
1339     } else {
1340       PtrIncr = 0;
1341     }
1342     break;
1343   case 4:
1344     Channel = ElemIdx;
1345     PtrIncr = 0;
1346     break;
1347   }
1348 }
1349
1350 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1351   SDLoc DL(Op);
1352   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1353   SDValue Chain = Op.getOperand(0);
1354   SDValue Value = Op.getOperand(1);
1355   SDValue Ptr = Op.getOperand(2);
1356
1357   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1358   if (Result.getNode()) {
1359     return Result;
1360   }
1361
1362   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1363     if (StoreNode->isTruncatingStore()) {
1364       EVT VT = Value.getValueType();
1365       assert(VT.bitsLE(MVT::i32));
1366       EVT MemVT = StoreNode->getMemoryVT();
1367       SDValue MaskConstant;
1368       if (MemVT == MVT::i8) {
1369         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1370       } else {
1371         assert(MemVT == MVT::i16);
1372         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1373       }
1374       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1375                                       DAG.getConstant(2, MVT::i32));
1376       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1377                                       DAG.getConstant(0x00000003, VT));
1378       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1379       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1380                                    DAG.getConstant(3, VT));
1381       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1382       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1383       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1384       // vector instead.
1385       SDValue Src[4] = {
1386         ShiftedValue,
1387         DAG.getConstant(0, MVT::i32),
1388         DAG.getConstant(0, MVT::i32),
1389         Mask
1390       };
1391       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1392       SDValue Args[3] = { Chain, Input, DWordAddr };
1393       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1394                                      Op->getVTList(), Args, MemVT,
1395                                      StoreNode->getMemOperand());
1396     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1397                Value.getValueType().bitsGE(MVT::i32)) {
1398       // Convert pointer from byte address to dword address.
1399       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1400                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1401                                     Ptr, DAG.getConstant(2, MVT::i32)));
1402
1403       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1404         llvm_unreachable("Truncated and indexed stores not supported yet");
1405       } else {
1406         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1407       }
1408       return Chain;
1409     }
1410   }
1411
1412   EVT ValueVT = Value.getValueType();
1413
1414   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1415     return SDValue();
1416   }
1417
1418   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1419   if (Ret.getNode()) {
1420     return Ret;
1421   }
1422   // Lowering for indirect addressing
1423
1424   const MachineFunction &MF = DAG.getMachineFunction();
1425   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1426                                          getTargetMachine().getFrameLowering());
1427   unsigned StackWidth = TFL->getStackWidth(MF);
1428
1429   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1430
1431   if (ValueVT.isVector()) {
1432     unsigned NumElemVT = ValueVT.getVectorNumElements();
1433     EVT ElemVT = ValueVT.getVectorElementType();
1434     SmallVector<SDValue, 4> Stores(NumElemVT);
1435
1436     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1437                                       "vector width in load");
1438
1439     for (unsigned i = 0; i < NumElemVT; ++i) {
1440       unsigned Channel, PtrIncr;
1441       getStackAddress(StackWidth, i, Channel, PtrIncr);
1442       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1443                         DAG.getConstant(PtrIncr, MVT::i32));
1444       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1445                                  Value, DAG.getConstant(i, MVT::i32));
1446
1447       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1448                               Chain, Elem, Ptr,
1449                               DAG.getTargetConstant(Channel, MVT::i32));
1450     }
1451      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1452    } else {
1453     if (ValueVT == MVT::i8) {
1454       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1455     }
1456     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1457     DAG.getTargetConstant(0, MVT::i32)); // Channel
1458   }
1459
1460   return Chain;
1461 }
1462
1463 // return (512 + (kc_bank << 12)
1464 static int
1465 ConstantAddressBlock(unsigned AddressSpace) {
1466   switch (AddressSpace) {
1467   case AMDGPUAS::CONSTANT_BUFFER_0:
1468     return 512;
1469   case AMDGPUAS::CONSTANT_BUFFER_1:
1470     return 512 + 4096;
1471   case AMDGPUAS::CONSTANT_BUFFER_2:
1472     return 512 + 4096 * 2;
1473   case AMDGPUAS::CONSTANT_BUFFER_3:
1474     return 512 + 4096 * 3;
1475   case AMDGPUAS::CONSTANT_BUFFER_4:
1476     return 512 + 4096 * 4;
1477   case AMDGPUAS::CONSTANT_BUFFER_5:
1478     return 512 + 4096 * 5;
1479   case AMDGPUAS::CONSTANT_BUFFER_6:
1480     return 512 + 4096 * 6;
1481   case AMDGPUAS::CONSTANT_BUFFER_7:
1482     return 512 + 4096 * 7;
1483   case AMDGPUAS::CONSTANT_BUFFER_8:
1484     return 512 + 4096 * 8;
1485   case AMDGPUAS::CONSTANT_BUFFER_9:
1486     return 512 + 4096 * 9;
1487   case AMDGPUAS::CONSTANT_BUFFER_10:
1488     return 512 + 4096 * 10;
1489   case AMDGPUAS::CONSTANT_BUFFER_11:
1490     return 512 + 4096 * 11;
1491   case AMDGPUAS::CONSTANT_BUFFER_12:
1492     return 512 + 4096 * 12;
1493   case AMDGPUAS::CONSTANT_BUFFER_13:
1494     return 512 + 4096 * 13;
1495   case AMDGPUAS::CONSTANT_BUFFER_14:
1496     return 512 + 4096 * 14;
1497   case AMDGPUAS::CONSTANT_BUFFER_15:
1498     return 512 + 4096 * 15;
1499   default:
1500     return -1;
1501   }
1502 }
1503
1504 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1505 {
1506   EVT VT = Op.getValueType();
1507   SDLoc DL(Op);
1508   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1509   SDValue Chain = Op.getOperand(0);
1510   SDValue Ptr = Op.getOperand(1);
1511   SDValue LoweredLoad;
1512
1513   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1514   if (Ret.getNode()) {
1515     SDValue Ops[2] = {
1516       Ret,
1517       Chain
1518     };
1519     return DAG.getMergeValues(Ops, DL);
1520   }
1521
1522
1523   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1524     SDValue MergedValues[2] = {
1525       SplitVectorLoad(Op, DAG),
1526       Chain
1527     };
1528     return DAG.getMergeValues(MergedValues, DL);
1529   }
1530
1531   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1532   if (ConstantBlock > -1 &&
1533       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1534        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1535     SDValue Result;
1536     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1537         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1538         isa<ConstantSDNode>(Ptr)) {
1539       SDValue Slots[4];
1540       for (unsigned i = 0; i < 4; i++) {
1541         // We want Const position encoded with the following formula :
1542         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1543         // const_index is Ptr computed by llvm using an alignment of 16.
1544         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1545         // then div by 4 at the ISel step
1546         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1547             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1548         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1549       }
1550       EVT NewVT = MVT::v4i32;
1551       unsigned NumElements = 4;
1552       if (VT.isVector()) {
1553         NewVT = VT;
1554         NumElements = VT.getVectorNumElements();
1555       }
1556       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1557                            makeArrayRef(Slots, NumElements));
1558     } else {
1559       // non-constant ptr can't be folded, keeps it as a v4f32 load
1560       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1561           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1562           DAG.getConstant(LoadNode->getAddressSpace() -
1563                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1564           );
1565     }
1566
1567     if (!VT.isVector()) {
1568       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1569           DAG.getConstant(0, MVT::i32));
1570     }
1571
1572     SDValue MergedValues[2] = {
1573       Result,
1574       Chain
1575     };
1576     return DAG.getMergeValues(MergedValues, DL);
1577   }
1578
1579   // For most operations returning SDValue() will result in the node being
1580   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1581   // need to manually expand loads that may be legal in some address spaces and
1582   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1583   // compute shaders, since the data is sign extended when it is uploaded to the
1584   // buffer. However SEXT loads from other address spaces are not supported, so
1585   // we need to expand them here.
1586   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1587     EVT MemVT = LoadNode->getMemoryVT();
1588     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1589     SDValue ShiftAmount =
1590           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1591     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1592                                   LoadNode->getPointerInfo(), MemVT,
1593                                   LoadNode->isVolatile(),
1594                                   LoadNode->isNonTemporal(),
1595                                   LoadNode->getAlignment());
1596     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1597     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1598
1599     SDValue MergedValues[2] = { Sra, Chain };
1600     return DAG.getMergeValues(MergedValues, DL);
1601   }
1602
1603   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1604     return SDValue();
1605   }
1606
1607   // Lowering for indirect addressing
1608   const MachineFunction &MF = DAG.getMachineFunction();
1609   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1610                                          getTargetMachine().getFrameLowering());
1611   unsigned StackWidth = TFL->getStackWidth(MF);
1612
1613   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1614
1615   if (VT.isVector()) {
1616     unsigned NumElemVT = VT.getVectorNumElements();
1617     EVT ElemVT = VT.getVectorElementType();
1618     SDValue Loads[4];
1619
1620     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1621                                       "vector width in load");
1622
1623     for (unsigned i = 0; i < NumElemVT; ++i) {
1624       unsigned Channel, PtrIncr;
1625       getStackAddress(StackWidth, i, Channel, PtrIncr);
1626       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1627                         DAG.getConstant(PtrIncr, MVT::i32));
1628       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1629                              Chain, Ptr,
1630                              DAG.getTargetConstant(Channel, MVT::i32),
1631                              Op.getOperand(2));
1632     }
1633     for (unsigned i = NumElemVT; i < 4; ++i) {
1634       Loads[i] = DAG.getUNDEF(ElemVT);
1635     }
1636     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1637     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1638   } else {
1639     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1640                               Chain, Ptr,
1641                               DAG.getTargetConstant(0, MVT::i32), // Channel
1642                               Op.getOperand(2));
1643   }
1644
1645   SDValue Ops[2] = {
1646     LoweredLoad,
1647     Chain
1648   };
1649
1650   return DAG.getMergeValues(Ops, DL);
1651 }
1652
1653 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1654   SDValue Chain = Op.getOperand(0);
1655   SDValue Cond  = Op.getOperand(1);
1656   SDValue Jump  = Op.getOperand(2);
1657
1658   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1659                      Chain, Jump, Cond);
1660 }
1661
1662 /// XXX Only kernel functions are supported, so we can assume for now that
1663 /// every function is a kernel function, but in the future we should use
1664 /// separate calling conventions for kernel and non-kernel functions.
1665 SDValue R600TargetLowering::LowerFormalArguments(
1666                                       SDValue Chain,
1667                                       CallingConv::ID CallConv,
1668                                       bool isVarArg,
1669                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1670                                       SDLoc DL, SelectionDAG &DAG,
1671                                       SmallVectorImpl<SDValue> &InVals) const {
1672   SmallVector<CCValAssign, 16> ArgLocs;
1673   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1674                  getTargetMachine(), ArgLocs, *DAG.getContext());
1675   MachineFunction &MF = DAG.getMachineFunction();
1676   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1677
1678   SmallVector<ISD::InputArg, 8> LocalIns;
1679
1680   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1681
1682   AnalyzeFormalArguments(CCInfo, LocalIns);
1683
1684   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1685     CCValAssign &VA = ArgLocs[i];
1686     EVT VT = Ins[i].VT;
1687     EVT MemVT = LocalIns[i].VT;
1688
1689     if (ShaderType != ShaderType::COMPUTE) {
1690       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1691       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1692       InVals.push_back(Register);
1693       continue;
1694     }
1695
1696     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1697                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1698
1699     // i64 isn't a legal type, so the register type used ends up as i32, which
1700     // isn't expected here. It attempts to create this sextload, but it ends up
1701     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1702     // for <1 x i64>.
1703
1704     // The first 36 bytes of the input buffer contains information about
1705     // thread group and global sizes.
1706
1707     // FIXME: This should really check the extload type, but the handling of
1708     // extload vecto parameters seems to be broken.
1709     //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1710     ISD::LoadExtType Ext = ISD::SEXTLOAD;
1711     SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
1712                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1713                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1714                                  MemVT, false, false, 4);
1715
1716     // 4 is the preferred alignment for the CONSTANT memory space.
1717     InVals.push_back(Arg);
1718   }
1719   return Chain;
1720 }
1721
1722 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1723    if (!VT.isVector())
1724      return MVT::i32;
1725    return VT.changeVectorElementTypeToInteger();
1726 }
1727
1728 static SDValue CompactSwizzlableVector(
1729   SelectionDAG &DAG, SDValue VectorEntry,
1730   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1731   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1732   assert(RemapSwizzle.empty());
1733   SDValue NewBldVec[4] = {
1734     VectorEntry.getOperand(0),
1735     VectorEntry.getOperand(1),
1736     VectorEntry.getOperand(2),
1737     VectorEntry.getOperand(3)
1738   };
1739
1740   for (unsigned i = 0; i < 4; i++) {
1741     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1742       // We mask write here to teach later passes that the ith element of this
1743       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1744       // break false dependencies and additionnaly make assembly easier to read.
1745       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1746     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1747       if (C->isZero()) {
1748         RemapSwizzle[i] = 4; // SEL_0
1749         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1750       } else if (C->isExactlyValue(1.0)) {
1751         RemapSwizzle[i] = 5; // SEL_1
1752         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1753       }
1754     }
1755
1756     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1757       continue;
1758     for (unsigned j = 0; j < i; j++) {
1759       if (NewBldVec[i] == NewBldVec[j]) {
1760         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1761         RemapSwizzle[i] = j;
1762         break;
1763       }
1764     }
1765   }
1766
1767   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1768                      VectorEntry.getValueType(), NewBldVec);
1769 }
1770
1771 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1772                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1773   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1774   assert(RemapSwizzle.empty());
1775   SDValue NewBldVec[4] = {
1776       VectorEntry.getOperand(0),
1777       VectorEntry.getOperand(1),
1778       VectorEntry.getOperand(2),
1779       VectorEntry.getOperand(3)
1780   };
1781   bool isUnmovable[4] = { false, false, false, false };
1782   for (unsigned i = 0; i < 4; i++) {
1783     RemapSwizzle[i] = i;
1784     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1785       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1786           ->getZExtValue();
1787       if (i == Idx)
1788         isUnmovable[Idx] = true;
1789     }
1790   }
1791
1792   for (unsigned i = 0; i < 4; i++) {
1793     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1794       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1795           ->getZExtValue();
1796       if (isUnmovable[Idx])
1797         continue;
1798       // Swap i and Idx
1799       std::swap(NewBldVec[Idx], NewBldVec[i]);
1800       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1801       break;
1802     }
1803   }
1804
1805   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1806                      VectorEntry.getValueType(), NewBldVec);
1807 }
1808
1809
1810 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1811 SDValue Swz[4], SelectionDAG &DAG) const {
1812   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1813   // Old -> New swizzle values
1814   DenseMap<unsigned, unsigned> SwizzleRemap;
1815
1816   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1817   for (unsigned i = 0; i < 4; i++) {
1818     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1819     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1820       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1821   }
1822
1823   SwizzleRemap.clear();
1824   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1825   for (unsigned i = 0; i < 4; i++) {
1826     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1827     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1828       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1829   }
1830
1831   return BuildVector;
1832 }
1833
1834
1835 //===----------------------------------------------------------------------===//
1836 // Custom DAG Optimizations
1837 //===----------------------------------------------------------------------===//
1838
1839 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1840                                               DAGCombinerInfo &DCI) const {
1841   SelectionDAG &DAG = DCI.DAG;
1842
1843   switch (N->getOpcode()) {
1844   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1845   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1846   case ISD::FP_ROUND: {
1847       SDValue Arg = N->getOperand(0);
1848       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1849         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1850                            Arg.getOperand(0));
1851       }
1852       break;
1853     }
1854
1855   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1856   // (i32 select_cc f32, f32, -1, 0 cc)
1857   //
1858   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1859   // this to one of the SET*_DX10 instructions.
1860   case ISD::FP_TO_SINT: {
1861     SDValue FNeg = N->getOperand(0);
1862     if (FNeg.getOpcode() != ISD::FNEG) {
1863       return SDValue();
1864     }
1865     SDValue SelectCC = FNeg.getOperand(0);
1866     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1867         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1868         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1869         !isHWTrueValue(SelectCC.getOperand(2)) ||
1870         !isHWFalseValue(SelectCC.getOperand(3))) {
1871       return SDValue();
1872     }
1873
1874     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1875                            SelectCC.getOperand(0), // LHS
1876                            SelectCC.getOperand(1), // RHS
1877                            DAG.getConstant(-1, MVT::i32), // True
1878                            DAG.getConstant(0, MVT::i32),  // Flase
1879                            SelectCC.getOperand(4)); // CC
1880
1881     break;
1882   }
1883
1884   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1885   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1886   case ISD::INSERT_VECTOR_ELT: {
1887     SDValue InVec = N->getOperand(0);
1888     SDValue InVal = N->getOperand(1);
1889     SDValue EltNo = N->getOperand(2);
1890     SDLoc dl(N);
1891
1892     // If the inserted element is an UNDEF, just use the input vector.
1893     if (InVal.getOpcode() == ISD::UNDEF)
1894       return InVec;
1895
1896     EVT VT = InVec.getValueType();
1897
1898     // If we can't generate a legal BUILD_VECTOR, exit
1899     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1900       return SDValue();
1901
1902     // Check that we know which element is being inserted
1903     if (!isa<ConstantSDNode>(EltNo))
1904       return SDValue();
1905     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1906
1907     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1908     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1909     // vector elements.
1910     SmallVector<SDValue, 8> Ops;
1911     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1912       Ops.append(InVec.getNode()->op_begin(),
1913                  InVec.getNode()->op_end());
1914     } else if (InVec.getOpcode() == ISD::UNDEF) {
1915       unsigned NElts = VT.getVectorNumElements();
1916       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1917     } else {
1918       return SDValue();
1919     }
1920
1921     // Insert the element
1922     if (Elt < Ops.size()) {
1923       // All the operands of BUILD_VECTOR must have the same type;
1924       // we enforce that here.
1925       EVT OpVT = Ops[0].getValueType();
1926       if (InVal.getValueType() != OpVT)
1927         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1928           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1929           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1930       Ops[Elt] = InVal;
1931     }
1932
1933     // Return the new vector
1934     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1935   }
1936
1937   // Extract_vec (Build_vector) generated by custom lowering
1938   // also needs to be customly combined
1939   case ISD::EXTRACT_VECTOR_ELT: {
1940     SDValue Arg = N->getOperand(0);
1941     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1942       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1943         unsigned Element = Const->getZExtValue();
1944         return Arg->getOperand(Element);
1945       }
1946     }
1947     if (Arg.getOpcode() == ISD::BITCAST &&
1948         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1949       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1950         unsigned Element = Const->getZExtValue();
1951         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1952             Arg->getOperand(0).getOperand(Element));
1953       }
1954     }
1955   }
1956
1957   case ISD::SELECT_CC: {
1958     // Try common optimizations
1959     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1960     if (Ret.getNode())
1961       return Ret;
1962
1963     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1964     //      selectcc x, y, a, b, inv(cc)
1965     //
1966     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1967     //      selectcc x, y, a, b, cc
1968     SDValue LHS = N->getOperand(0);
1969     if (LHS.getOpcode() != ISD::SELECT_CC) {
1970       return SDValue();
1971     }
1972
1973     SDValue RHS = N->getOperand(1);
1974     SDValue True = N->getOperand(2);
1975     SDValue False = N->getOperand(3);
1976     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1977
1978     if (LHS.getOperand(2).getNode() != True.getNode() ||
1979         LHS.getOperand(3).getNode() != False.getNode() ||
1980         RHS.getNode() != False.getNode()) {
1981       return SDValue();
1982     }
1983
1984     switch (NCC) {
1985     default: return SDValue();
1986     case ISD::SETNE: return LHS;
1987     case ISD::SETEQ: {
1988       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1989       LHSCC = ISD::getSetCCInverse(LHSCC,
1990                                   LHS.getOperand(0).getValueType().isInteger());
1991       if (DCI.isBeforeLegalizeOps() ||
1992           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1993         return DAG.getSelectCC(SDLoc(N),
1994                                LHS.getOperand(0),
1995                                LHS.getOperand(1),
1996                                LHS.getOperand(2),
1997                                LHS.getOperand(3),
1998                                LHSCC);
1999       break;
2000     }
2001     }
2002     return SDValue();
2003   }
2004
2005   case AMDGPUISD::EXPORT: {
2006     SDValue Arg = N->getOperand(1);
2007     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2008       break;
2009
2010     SDValue NewArgs[8] = {
2011       N->getOperand(0), // Chain
2012       SDValue(),
2013       N->getOperand(2), // ArrayBase
2014       N->getOperand(3), // Type
2015       N->getOperand(4), // SWZ_X
2016       N->getOperand(5), // SWZ_Y
2017       N->getOperand(6), // SWZ_Z
2018       N->getOperand(7) // SWZ_W
2019     };
2020     SDLoc DL(N);
2021     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2022     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2023   }
2024   case AMDGPUISD::TEXTURE_FETCH: {
2025     SDValue Arg = N->getOperand(1);
2026     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2027       break;
2028
2029     SDValue NewArgs[19] = {
2030       N->getOperand(0),
2031       N->getOperand(1),
2032       N->getOperand(2),
2033       N->getOperand(3),
2034       N->getOperand(4),
2035       N->getOperand(5),
2036       N->getOperand(6),
2037       N->getOperand(7),
2038       N->getOperand(8),
2039       N->getOperand(9),
2040       N->getOperand(10),
2041       N->getOperand(11),
2042       N->getOperand(12),
2043       N->getOperand(13),
2044       N->getOperand(14),
2045       N->getOperand(15),
2046       N->getOperand(16),
2047       N->getOperand(17),
2048       N->getOperand(18),
2049     };
2050     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2051     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2052         NewArgs);
2053   }
2054   }
2055
2056   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2057 }
2058
2059 static bool
2060 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2061             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2062   const R600InstrInfo *TII =
2063       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2064   if (!Src.isMachineOpcode())
2065     return false;
2066   switch (Src.getMachineOpcode()) {
2067   case AMDGPU::FNEG_R600:
2068     if (!Neg.getNode())
2069       return false;
2070     Src = Src.getOperand(0);
2071     Neg = DAG.getTargetConstant(1, MVT::i32);
2072     return true;
2073   case AMDGPU::FABS_R600:
2074     if (!Abs.getNode())
2075       return false;
2076     Src = Src.getOperand(0);
2077     Abs = DAG.getTargetConstant(1, MVT::i32);
2078     return true;
2079   case AMDGPU::CONST_COPY: {
2080     unsigned Opcode = ParentNode->getMachineOpcode();
2081     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2082
2083     if (!Sel.getNode())
2084       return false;
2085
2086     SDValue CstOffset = Src.getOperand(0);
2087     if (ParentNode->getValueType(0).isVector())
2088       return false;
2089
2090     // Gather constants values
2091     int SrcIndices[] = {
2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2096       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2099       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2100       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2101       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2102       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2103     };
2104     std::vector<unsigned> Consts;
2105     for (int OtherSrcIdx : SrcIndices) {
2106       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2107       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2108         continue;
2109       if (HasDst) {
2110         OtherSrcIdx--;
2111         OtherSelIdx--;
2112       }
2113       if (RegisterSDNode *Reg =
2114           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2115         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2116           ConstantSDNode *Cst
2117             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2118           Consts.push_back(Cst->getZExtValue());
2119         }
2120       }
2121     }
2122
2123     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2124     Consts.push_back(Cst->getZExtValue());
2125     if (!TII->fitsConstReadLimitations(Consts)) {
2126       return false;
2127     }
2128
2129     Sel = CstOffset;
2130     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2131     return true;
2132   }
2133   case AMDGPU::MOV_IMM_I32:
2134   case AMDGPU::MOV_IMM_F32: {
2135     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2136     uint64_t ImmValue = 0;
2137
2138
2139     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2140       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2141       float FloatValue = FPC->getValueAPF().convertToFloat();
2142       if (FloatValue == 0.0) {
2143         ImmReg = AMDGPU::ZERO;
2144       } else if (FloatValue == 0.5) {
2145         ImmReg = AMDGPU::HALF;
2146       } else if (FloatValue == 1.0) {
2147         ImmReg = AMDGPU::ONE;
2148       } else {
2149         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2150       }
2151     } else {
2152       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2153       uint64_t Value = C->getZExtValue();
2154       if (Value == 0) {
2155         ImmReg = AMDGPU::ZERO;
2156       } else if (Value == 1) {
2157         ImmReg = AMDGPU::ONE_INT;
2158       } else {
2159         ImmValue = Value;
2160       }
2161     }
2162
2163     // Check that we aren't already using an immediate.
2164     // XXX: It's possible for an instruction to have more than one
2165     // immediate operand, but this is not supported yet.
2166     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2167       if (!Imm.getNode())
2168         return false;
2169       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2170       assert(C);
2171       if (C->getZExtValue())
2172         return false;
2173       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2174     }
2175     Src = DAG.getRegister(ImmReg, MVT::i32);
2176     return true;
2177   }
2178   default:
2179     return false;
2180   }
2181 }
2182
2183
2184 /// \brief Fold the instructions after selecting them
2185 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2186                                             SelectionDAG &DAG) const {
2187   const R600InstrInfo *TII =
2188       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2189   if (!Node->isMachineOpcode())
2190     return Node;
2191   unsigned Opcode = Node->getMachineOpcode();
2192   SDValue FakeOp;
2193
2194   std::vector<SDValue> Ops;
2195   for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
2196               I != E; ++I)
2197           Ops.push_back(*I);
2198
2199   if (Opcode == AMDGPU::DOT_4) {
2200     int OperandIdx[] = {
2201       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2202       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2209         };
2210     int NegIdx[] = {
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2219     };
2220     int AbsIdx[] = {
2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2222       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2223       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2224       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2225       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2226       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2227       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2228       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2229     };
2230     for (unsigned i = 0; i < 8; i++) {
2231       if (OperandIdx[i] < 0)
2232         return Node;
2233       SDValue &Src = Ops[OperandIdx[i] - 1];
2234       SDValue &Neg = Ops[NegIdx[i] - 1];
2235       SDValue &Abs = Ops[AbsIdx[i] - 1];
2236       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2237       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2238       if (HasDst)
2239         SelIdx--;
2240       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2241       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2242         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2243     }
2244   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2245     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2246       SDValue &Src = Ops[i];
2247       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2248         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2249     }
2250   } else if (Opcode == AMDGPU::CLAMP_R600) {
2251     SDValue Src = Node->getOperand(0);
2252     if (!Src.isMachineOpcode() ||
2253         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2254       return Node;
2255     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2256         AMDGPU::OpName::clamp);
2257     if (ClampIdx < 0)
2258       return Node;
2259     std::vector<SDValue> Ops;
2260     unsigned NumOp = Src.getNumOperands();
2261     for(unsigned i = 0; i < NumOp; ++i)
2262           Ops.push_back(Src.getOperand(i));
2263     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2264     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2265         Node->getVTList(), Ops);
2266   } else {
2267     if (!TII->hasInstrModifiers(Opcode))
2268       return Node;
2269     int OperandIdx[] = {
2270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2271       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2272       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2273     };
2274     int NegIdx[] = {
2275       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2276       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2277       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2278     };
2279     int AbsIdx[] = {
2280       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2281       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2282       -1
2283     };
2284     for (unsigned i = 0; i < 3; i++) {
2285       if (OperandIdx[i] < 0)
2286         return Node;
2287       SDValue &Src = Ops[OperandIdx[i] - 1];
2288       SDValue &Neg = Ops[NegIdx[i] - 1];
2289       SDValue FakeAbs;
2290       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2291       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2292       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2293       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2294       if (HasDst) {
2295         SelIdx--;
2296         ImmIdx--;
2297       }
2298       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2299       SDValue &Imm = Ops[ImmIdx];
2300       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2301         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2302     }
2303   }
2304
2305   return Node;
2306 }