lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/CodeGen/CallingConvLower.h"
  23 #include "llvm/CodeGen/MachineFrameInfo.h"
  24 #include "llvm/CodeGen/MachineInstrBuilder.h"
  25 #include "llvm/CodeGen/MachineRegisterInfo.h"
  26 #include "llvm/CodeGen/SelectionDAG.h"
  27 #include "llvm/IR/Argument.h"
  28 #include "llvm/IR/Function.h"
  29
  30 using namespace llvm;
  31
  32 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  33     AMDGPUTargetLowering(TM),
  34     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  35   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  37   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  38   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  39   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  40   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  41
  42   computeRegisterProperties();
  43
  44   // Set condition code actions
  45   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  46   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  57
  58   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  59   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  62
  63   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  64   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  65
  66   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  67   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  68
  69   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  70   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  71   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  72
  73   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  74
  75   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  76   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  78
  79   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  80   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  81
  82   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  83   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  84   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  85   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  86   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  87
  88   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  89   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  92
  93   // Expand sign extension of vectors
  94   if (!Subtarget->hasBFE())
  95     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  96
  97   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
  99
 100   if (!Subtarget->hasBFE())
 101     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 102   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 104
 105   if (!Subtarget->hasBFE())
 106     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 109
 110   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 113
 114   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 115
 116
 117   // Legalize loads and stores to the private address space.
 118   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 119   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 121
 122   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 123   // spaces, so it is custom lowered to handle those where it isn't.
 124   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
 125   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
 126   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
 127   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 128   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 129   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 130
 131   setOperationAction(ISD::STORE, MVT::i8, Custom);
 132   setOperationAction(ISD::STORE, MVT::i32, Custom);
 133   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 134   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 135   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 136   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 137
 138   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 139   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 140   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 141
 142   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 143   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 144   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 145   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 146
 147   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 148   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 149   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 150   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 151
 152   setTargetDAGCombine(ISD::FP_ROUND);
 153   setTargetDAGCombine(ISD::FP_TO_SINT);
 154   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 155   setTargetDAGCombine(ISD::SELECT_CC);
 156   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 157
 158   setOperationAction(ISD::SUB, MVT::i64, Expand);
 159
 160   // These should be replaced by UDVIREM, but it does not happen automatically
 161   // during Type Legalization
 162   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 163   setOperationAction(ISD::UREM, MVT::i64, Custom);
 164   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 165   setOperationAction(ISD::SREM, MVT::i64, Custom);
 166
 167   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 168   //  to be Legal/Custom in order to avoid library calls.
 169   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 170   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 171   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 172
 173   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 174
 175   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 176   for (MVT VT : ScalarIntVTs) {
 177     setOperationAction(ISD::ADDC, VT, Expand);
 178     setOperationAction(ISD::SUBC, VT, Expand);
 179     setOperationAction(ISD::ADDE, VT, Expand);
 180     setOperationAction(ISD::SUBE, VT, Expand);
 181   }
 182
 183   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 184   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 185   setSchedulingPreference(Sched::Source);
 186 }
 187
 188 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 189     MachineInstr * MI, MachineBasicBlock * BB) const {
 190   MachineFunction * MF = BB->getParent();
 191   MachineRegisterInfo &MRI = MF->getRegInfo();
 192   MachineBasicBlock::iterator I = *MI;
 193   const R600InstrInfo *TII =
 194     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 195
 196   switch (MI->getOpcode()) {
 197   default:
 198     // Replace LDS_*_RET instruction that don't have any uses with the
 199     // equivalent LDS_*_NORET instruction.
 200     if (TII->isLDSRetInstr(MI->getOpcode())) {
 201       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 202       assert(DstIdx != -1);
 203       MachineInstrBuilder NewMI;
 204       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
 205         return BB;
 206
 207       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 208                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 209       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 210         NewMI.addOperand(MI->getOperand(i));
 211       }
 212     } else {
 213       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 214     }
 215     break;
 216   case AMDGPU::CLAMP_R600: {
 217     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 218                                                    AMDGPU::MOV,
 219                                                    MI->getOperand(0).getReg(),
 220                                                    MI->getOperand(1).getReg());
 221     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 222     break;
 223   }
 224
 225   case AMDGPU::FABS_R600: {
 226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 227                                                     AMDGPU::MOV,
 228                                                     MI->getOperand(0).getReg(),
 229                                                     MI->getOperand(1).getReg());
 230     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 231     break;
 232   }
 233
 234   case AMDGPU::FNEG_R600: {
 235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 236                                                     AMDGPU::MOV,
 237                                                     MI->getOperand(0).getReg(),
 238                                                     MI->getOperand(1).getReg());
 239     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 240     break;
 241   }
 242
 243   case AMDGPU::MASK_WRITE: {
 244     unsigned maskedRegister = MI->getOperand(0).getReg();
 245     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 246     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 247     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 248     break;
 249   }
 250
 251   case AMDGPU::MOV_IMM_F32:
 252     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 253                      MI->getOperand(1).getFPImm()->getValueAPF()
 254                          .bitcastToAPInt().getZExtValue());
 255     break;
 256   case AMDGPU::MOV_IMM_I32:
 257     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 258                      MI->getOperand(1).getImm());
 259     break;
 260   case AMDGPU::CONST_COPY: {
 261     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 262         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 263     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 264         MI->getOperand(1).getImm());
 265     break;
 266   }
 267
 268   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 269   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 270   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 271     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 272
 273     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 274             .addOperand(MI->getOperand(0))
 275             .addOperand(MI->getOperand(1))
 276             .addImm(EOP); // Set End of program bit
 277     break;
 278   }
 279
 280   case AMDGPU::TXD: {
 281     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 282     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 283     MachineOperand &RID = MI->getOperand(4);
 284     MachineOperand &SID = MI->getOperand(5);
 285     unsigned TextureId = MI->getOperand(6).getImm();
 286     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 287     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 288
 289     switch (TextureId) {
 290     case 5: // Rect
 291       CTX = CTY = 0;
 292       break;
 293     case 6: // Shadow1D
 294       SrcW = SrcZ;
 295       break;
 296     case 7: // Shadow2D
 297       SrcW = SrcZ;
 298       break;
 299     case 8: // ShadowRect
 300       CTX = CTY = 0;
 301       SrcW = SrcZ;
 302       break;
 303     case 9: // 1DArray
 304       SrcZ = SrcY;
 305       CTZ = 0;
 306       break;
 307     case 10: // 2DArray
 308       CTZ = 0;
 309       break;
 310     case 11: // Shadow1DArray
 311       SrcZ = SrcY;
 312       CTZ = 0;
 313       break;
 314     case 12: // Shadow2DArray
 315       CTZ = 0;
 316       break;
 317     }
 318     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 319             .addOperand(MI->getOperand(3))
 320             .addImm(SrcX)
 321             .addImm(SrcY)
 322             .addImm(SrcZ)
 323             .addImm(SrcW)
 324             .addImm(0)
 325             .addImm(0)
 326             .addImm(0)
 327             .addImm(0)
 328             .addImm(1)
 329             .addImm(2)
 330             .addImm(3)
 331             .addOperand(RID)
 332             .addOperand(SID)
 333             .addImm(CTX)
 334             .addImm(CTY)
 335             .addImm(CTZ)
 336             .addImm(CTW);
 337     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 338             .addOperand(MI->getOperand(2))
 339             .addImm(SrcX)
 340             .addImm(SrcY)
 341             .addImm(SrcZ)
 342             .addImm(SrcW)
 343             .addImm(0)
 344             .addImm(0)
 345             .addImm(0)
 346             .addImm(0)
 347             .addImm(1)
 348             .addImm(2)
 349             .addImm(3)
 350             .addOperand(RID)
 351             .addOperand(SID)
 352             .addImm(CTX)
 353             .addImm(CTY)
 354             .addImm(CTZ)
 355             .addImm(CTW);
 356     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 357             .addOperand(MI->getOperand(0))
 358             .addOperand(MI->getOperand(1))
 359             .addImm(SrcX)
 360             .addImm(SrcY)
 361             .addImm(SrcZ)
 362             .addImm(SrcW)
 363             .addImm(0)
 364             .addImm(0)
 365             .addImm(0)
 366             .addImm(0)
 367             .addImm(1)
 368             .addImm(2)
 369             .addImm(3)
 370             .addOperand(RID)
 371             .addOperand(SID)
 372             .addImm(CTX)
 373             .addImm(CTY)
 374             .addImm(CTZ)
 375             .addImm(CTW)
 376             .addReg(T0, RegState::Implicit)
 377             .addReg(T1, RegState::Implicit);
 378     break;
 379   }
 380
 381   case AMDGPU::TXD_SHADOW: {
 382     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 383     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 384     MachineOperand &RID = MI->getOperand(4);
 385     MachineOperand &SID = MI->getOperand(5);
 386     unsigned TextureId = MI->getOperand(6).getImm();
 387     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 388     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 389
 390     switch (TextureId) {
 391     case 5: // Rect
 392       CTX = CTY = 0;
 393       break;
 394     case 6: // Shadow1D
 395       SrcW = SrcZ;
 396       break;
 397     case 7: // Shadow2D
 398       SrcW = SrcZ;
 399       break;
 400     case 8: // ShadowRect
 401       CTX = CTY = 0;
 402       SrcW = SrcZ;
 403       break;
 404     case 9: // 1DArray
 405       SrcZ = SrcY;
 406       CTZ = 0;
 407       break;
 408     case 10: // 2DArray
 409       CTZ = 0;
 410       break;
 411     case 11: // Shadow1DArray
 412       SrcZ = SrcY;
 413       CTZ = 0;
 414       break;
 415     case 12: // Shadow2DArray
 416       CTZ = 0;
 417       break;
 418     }
 419
 420     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 421             .addOperand(MI->getOperand(3))
 422             .addImm(SrcX)
 423             .addImm(SrcY)
 424             .addImm(SrcZ)
 425             .addImm(SrcW)
 426             .addImm(0)
 427             .addImm(0)
 428             .addImm(0)
 429             .addImm(0)
 430             .addImm(1)
 431             .addImm(2)
 432             .addImm(3)
 433             .addOperand(RID)
 434             .addOperand(SID)
 435             .addImm(CTX)
 436             .addImm(CTY)
 437             .addImm(CTZ)
 438             .addImm(CTW);
 439     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 440             .addOperand(MI->getOperand(2))
 441             .addImm(SrcX)
 442             .addImm(SrcY)
 443             .addImm(SrcZ)
 444             .addImm(SrcW)
 445             .addImm(0)
 446             .addImm(0)
 447             .addImm(0)
 448             .addImm(0)
 449             .addImm(1)
 450             .addImm(2)
 451             .addImm(3)
 452             .addOperand(RID)
 453             .addOperand(SID)
 454             .addImm(CTX)
 455             .addImm(CTY)
 456             .addImm(CTZ)
 457             .addImm(CTW);
 458     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 459             .addOperand(MI->getOperand(0))
 460             .addOperand(MI->getOperand(1))
 461             .addImm(SrcX)
 462             .addImm(SrcY)
 463             .addImm(SrcZ)
 464             .addImm(SrcW)
 465             .addImm(0)
 466             .addImm(0)
 467             .addImm(0)
 468             .addImm(0)
 469             .addImm(1)
 470             .addImm(2)
 471             .addImm(3)
 472             .addOperand(RID)
 473             .addOperand(SID)
 474             .addImm(CTX)
 475             .addImm(CTY)
 476             .addImm(CTZ)
 477             .addImm(CTW)
 478             .addReg(T0, RegState::Implicit)
 479             .addReg(T1, RegState::Implicit);
 480     break;
 481   }
 482
 483   case AMDGPU::BRANCH:
 484       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 485               .addOperand(MI->getOperand(0));
 486       break;
 487
 488   case AMDGPU::BRANCH_COND_f32: {
 489     MachineInstr *NewMI =
 490       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 491               AMDGPU::PREDICATE_BIT)
 492               .addOperand(MI->getOperand(1))
 493               .addImm(OPCODE_IS_NOT_ZERO)
 494               .addImm(0); // Flags
 495     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 496     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 497             .addOperand(MI->getOperand(0))
 498             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 499     break;
 500   }
 501
 502   case AMDGPU::BRANCH_COND_i32: {
 503     MachineInstr *NewMI =
 504       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 505             AMDGPU::PREDICATE_BIT)
 506             .addOperand(MI->getOperand(1))
 507             .addImm(OPCODE_IS_NOT_ZERO_INT)
 508             .addImm(0); // Flags
 509     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 510     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 511            .addOperand(MI->getOperand(0))
 512             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 513     break;
 514   }
 515
 516   case AMDGPU::EG_ExportSwz:
 517   case AMDGPU::R600_ExportSwz: {
 518     // Instruction is left unmodified if its not the last one of its type
 519     bool isLastInstructionOfItsType = true;
 520     unsigned InstExportType = MI->getOperand(1).getImm();
 521     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 522          EndBlock = BB->end(); NextExportInst != EndBlock;
 523          NextExportInst = std::next(NextExportInst)) {
 524       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 525           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 526         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 527             .getImm();
 528         if (CurrentInstExportType == InstExportType) {
 529           isLastInstructionOfItsType = false;
 530           break;
 531         }
 532       }
 533     }
 534     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 535     if (!EOP && !isLastInstructionOfItsType)
 536       return BB;
 537     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 538     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 539             .addOperand(MI->getOperand(0))
 540             .addOperand(MI->getOperand(1))
 541             .addOperand(MI->getOperand(2))
 542             .addOperand(MI->getOperand(3))
 543             .addOperand(MI->getOperand(4))
 544             .addOperand(MI->getOperand(5))
 545             .addOperand(MI->getOperand(6))
 546             .addImm(CfInst)
 547             .addImm(EOP);
 548     break;
 549   }
 550   case AMDGPU::RETURN: {
 551     // RETURN instructions must have the live-out registers as implicit uses,
 552     // otherwise they appear dead.
 553     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 554     MachineInstrBuilder MIB(*MF, MI);
 555     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 556       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 557     return BB;
 558   }
 559   }
 560
 561   MI->eraseFromParent();
 562   return BB;
 563 }
 564
 565 //===----------------------------------------------------------------------===//
 566 // Custom DAG Lowering Operations
 567 //===----------------------------------------------------------------------===//
 568
 569 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 570   MachineFunction &MF = DAG.getMachineFunction();
 571   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 572   switch (Op.getOpcode()) {
 573   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 574   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 575   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 576   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 577   case ISD::SRA_PARTS:
 578   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 579   case ISD::FCOS:
 580   case ISD::FSIN: return LowerTrig(Op, DAG);
 581   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 582   case ISD::STORE: return LowerSTORE(Op, DAG);
 583   case ISD::LOAD: {
 584     SDValue Result = LowerLOAD(Op, DAG);
 585     assert((!Result.getNode() ||
 586             Result.getNode()->getNumValues() == 2) &&
 587            "Load should return a value and a chain");
 588     return Result;
 589   }
 590
 591   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 592   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 593   case ISD::INTRINSIC_VOID: {
 594     SDValue Chain = Op.getOperand(0);
 595     unsigned IntrinsicID =
 596                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 597     switch (IntrinsicID) {
 598     case AMDGPUIntrinsic::AMDGPU_store_output: {
 599       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 600       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 601       MFI->LiveOuts.push_back(Reg);
 602       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 603     }
 604     case AMDGPUIntrinsic::R600_store_swizzle: {
 605       const SDValue Args[8] = {
 606         Chain,
 607         Op.getOperand(2), // Export Value
 608         Op.getOperand(3), // ArrayBase
 609         Op.getOperand(4), // Type
 610         DAG.getConstant(0, MVT::i32), // SWZ_X
 611         DAG.getConstant(1, MVT::i32), // SWZ_Y
 612         DAG.getConstant(2, MVT::i32), // SWZ_Z
 613         DAG.getConstant(3, MVT::i32) // SWZ_W
 614       };
 615       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 616     }
 617
 618     // default for switch(IntrinsicID)
 619     default: break;
 620     }
 621     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 622     break;
 623   }
 624   case ISD::INTRINSIC_WO_CHAIN: {
 625     unsigned IntrinsicID =
 626                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 627     EVT VT = Op.getValueType();
 628     SDLoc DL(Op);
 629     switch(IntrinsicID) {
 630     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 631     case AMDGPUIntrinsic::R600_load_input: {
 632       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 633       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 634       MachineFunction &MF = DAG.getMachineFunction();
 635       MachineRegisterInfo &MRI = MF.getRegInfo();
 636       MRI.addLiveIn(Reg);
 637       return DAG.getCopyFromReg(DAG.getEntryNode(),
 638           SDLoc(DAG.getEntryNode()), Reg, VT);
 639     }
 640
 641     case AMDGPUIntrinsic::R600_interp_input: {
 642       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 643       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 644       MachineSDNode *interp;
 645       if (ijb < 0) {
 646         const MachineFunction &MF = DAG.getMachineFunction();
 647         const R600InstrInfo *TII =
 648           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 649         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 650             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 651         return DAG.getTargetExtractSubreg(
 652             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 653             DL, MVT::f32, SDValue(interp, 0));
 654       }
 655       MachineFunction &MF = DAG.getMachineFunction();
 656       MachineRegisterInfo &MRI = MF.getRegInfo();
 657       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 658       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 659       MRI.addLiveIn(RegisterI);
 660       MRI.addLiveIn(RegisterJ);
 661       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 662           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 663       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 664           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 665
 666       if (slot % 4 < 2)
 667         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 668             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 669             RegisterJNode, RegisterINode);
 670       else
 671         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 672             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 673             RegisterJNode, RegisterINode);
 674       return SDValue(interp, slot % 2);
 675     }
 676     case AMDGPUIntrinsic::R600_interp_xy:
 677     case AMDGPUIntrinsic::R600_interp_zw: {
 678       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 679       MachineSDNode *interp;
 680       SDValue RegisterINode = Op.getOperand(2);
 681       SDValue RegisterJNode = Op.getOperand(3);
 682
 683       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 684         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 685             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 686             RegisterJNode, RegisterINode);
 687       else
 688         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 689             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 690             RegisterJNode, RegisterINode);
 691       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 692           SDValue(interp, 0), SDValue(interp, 1));
 693     }
 694     case AMDGPUIntrinsic::R600_tex:
 695     case AMDGPUIntrinsic::R600_texc:
 696     case AMDGPUIntrinsic::R600_txl:
 697     case AMDGPUIntrinsic::R600_txlc:
 698     case AMDGPUIntrinsic::R600_txb:
 699     case AMDGPUIntrinsic::R600_txbc:
 700     case AMDGPUIntrinsic::R600_txf:
 701     case AMDGPUIntrinsic::R600_txq:
 702     case AMDGPUIntrinsic::R600_ddx:
 703     case AMDGPUIntrinsic::R600_ddy:
 704     case AMDGPUIntrinsic::R600_ldptr: {
 705       unsigned TextureOp;
 706       switch (IntrinsicID) {
 707       case AMDGPUIntrinsic::R600_tex:
 708         TextureOp = 0;
 709         break;
 710       case AMDGPUIntrinsic::R600_texc:
 711         TextureOp = 1;
 712         break;
 713       case AMDGPUIntrinsic::R600_txl:
 714         TextureOp = 2;
 715         break;
 716       case AMDGPUIntrinsic::R600_txlc:
 717         TextureOp = 3;
 718         break;
 719       case AMDGPUIntrinsic::R600_txb:
 720         TextureOp = 4;
 721         break;
 722       case AMDGPUIntrinsic::R600_txbc:
 723         TextureOp = 5;
 724         break;
 725       case AMDGPUIntrinsic::R600_txf:
 726         TextureOp = 6;
 727         break;
 728       case AMDGPUIntrinsic::R600_txq:
 729         TextureOp = 7;
 730         break;
 731       case AMDGPUIntrinsic::R600_ddx:
 732         TextureOp = 8;
 733         break;
 734       case AMDGPUIntrinsic::R600_ddy:
 735         TextureOp = 9;
 736         break;
 737       case AMDGPUIntrinsic::R600_ldptr:
 738         TextureOp = 10;
 739         break;
 740       default:
 741         llvm_unreachable("Unknow Texture Operation");
 742       }
 743
 744       SDValue TexArgs[19] = {
 745         DAG.getConstant(TextureOp, MVT::i32),
 746         Op.getOperand(1),
 747         DAG.getConstant(0, MVT::i32),
 748         DAG.getConstant(1, MVT::i32),
 749         DAG.getConstant(2, MVT::i32),
 750         DAG.getConstant(3, MVT::i32),
 751         Op.getOperand(2),
 752         Op.getOperand(3),
 753         Op.getOperand(4),
 754         DAG.getConstant(0, MVT::i32),
 755         DAG.getConstant(1, MVT::i32),
 756         DAG.getConstant(2, MVT::i32),
 757         DAG.getConstant(3, MVT::i32),
 758         Op.getOperand(5),
 759         Op.getOperand(6),
 760         Op.getOperand(7),
 761         Op.getOperand(8),
 762         Op.getOperand(9),
 763         Op.getOperand(10)
 764       };
 765       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 766     }
 767     case AMDGPUIntrinsic::AMDGPU_dp4: {
 768       SDValue Args[8] = {
 769       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 770           DAG.getConstant(0, MVT::i32)),
 771       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 772           DAG.getConstant(0, MVT::i32)),
 773       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 774           DAG.getConstant(1, MVT::i32)),
 775       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 776           DAG.getConstant(1, MVT::i32)),
 777       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 778           DAG.getConstant(2, MVT::i32)),
 779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 780           DAG.getConstant(2, MVT::i32)),
 781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 782           DAG.getConstant(3, MVT::i32)),
 783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 784           DAG.getConstant(3, MVT::i32))
 785       };
 786       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 787     }
 788
 789     case Intrinsic::r600_read_ngroups_x:
 790       return LowerImplicitParameter(DAG, VT, DL, 0);
 791     case Intrinsic::r600_read_ngroups_y:
 792       return LowerImplicitParameter(DAG, VT, DL, 1);
 793     case Intrinsic::r600_read_ngroups_z:
 794       return LowerImplicitParameter(DAG, VT, DL, 2);
 795     case Intrinsic::r600_read_global_size_x:
 796       return LowerImplicitParameter(DAG, VT, DL, 3);
 797     case Intrinsic::r600_read_global_size_y:
 798       return LowerImplicitParameter(DAG, VT, DL, 4);
 799     case Intrinsic::r600_read_global_size_z:
 800       return LowerImplicitParameter(DAG, VT, DL, 5);
 801     case Intrinsic::r600_read_local_size_x:
 802       return LowerImplicitParameter(DAG, VT, DL, 6);
 803     case Intrinsic::r600_read_local_size_y:
 804       return LowerImplicitParameter(DAG, VT, DL, 7);
 805     case Intrinsic::r600_read_local_size_z:
 806       return LowerImplicitParameter(DAG, VT, DL, 8);
 807
 808     case Intrinsic::r600_read_tgid_x:
 809       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 810                                   AMDGPU::T1_X, VT);
 811     case Intrinsic::r600_read_tgid_y:
 812       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 813                                   AMDGPU::T1_Y, VT);
 814     case Intrinsic::r600_read_tgid_z:
 815       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 816                                   AMDGPU::T1_Z, VT);
 817     case Intrinsic::r600_read_tidig_x:
 818       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 819                                   AMDGPU::T0_X, VT);
 820     case Intrinsic::r600_read_tidig_y:
 821       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 822                                   AMDGPU::T0_Y, VT);
 823     case Intrinsic::r600_read_tidig_z:
 824       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 825                                   AMDGPU::T0_Z, VT);
 826     case Intrinsic::AMDGPU_rsq:
 827       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 828       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 829     }
 830     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 831     break;
 832   }
 833   } // end switch(Op.getOpcode())
 834   return SDValue();
 835 }
 836
 837 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 838                                             SmallVectorImpl<SDValue> &Results,
 839                                             SelectionDAG &DAG) const {
 840   switch (N->getOpcode()) {
 841   default:
 842     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 843     return;
 844   case ISD::FP_TO_UINT:
 845     if (N->getValueType(0) == MVT::i1) {
 846       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 847       return;
 848     }
 849     // Fall-through. Since we don't care about out of bounds values
 850     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 851     // considers some extra cases which are not necessary here.
 852   case ISD::FP_TO_SINT: {
 853     SDValue Result;
 854     if (expandFP_TO_SINT(N, Result, DAG))
 855       Results.push_back(Result);
 856     return;
 857   }
 858   case ISD::UDIV: {
 859     SDValue Op = SDValue(N, 0);
 860     SDLoc DL(Op);
 861     EVT VT = Op.getValueType();
 862     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 863       N->getOperand(0), N->getOperand(1));
 864     Results.push_back(UDIVREM);
 865     break;
 866   }
 867   case ISD::UREM: {
 868     SDValue Op = SDValue(N, 0);
 869     SDLoc DL(Op);
 870     EVT VT = Op.getValueType();
 871     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 872       N->getOperand(0), N->getOperand(1));
 873     Results.push_back(UDIVREM.getValue(1));
 874     break;
 875   }
 876   case ISD::SDIV: {
 877     SDValue Op = SDValue(N, 0);
 878     SDLoc DL(Op);
 879     EVT VT = Op.getValueType();
 880     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 881       N->getOperand(0), N->getOperand(1));
 882     Results.push_back(SDIVREM);
 883     break;
 884   }
 885   case ISD::SREM: {
 886     SDValue Op = SDValue(N, 0);
 887     SDLoc DL(Op);
 888     EVT VT = Op.getValueType();
 889     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 890       N->getOperand(0), N->getOperand(1));
 891     Results.push_back(SDIVREM.getValue(1));
 892     break;
 893   }
 894   case ISD::SDIVREM: {
 895     SDValue Op = SDValue(N, 1);
 896     SDValue RES = LowerSDIVREM(Op, DAG);
 897     Results.push_back(RES);
 898     Results.push_back(RES.getValue(1));
 899     break;
 900   }
 901   case ISD::UDIVREM: {
 902     SDValue Op = SDValue(N, 0);
 903     SDLoc DL(Op);
 904     EVT VT = Op.getValueType();
 905     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
 906
 907     SDValue one = DAG.getConstant(1, HalfVT);
 908     SDValue zero = DAG.getConstant(0, HalfVT);
 909
 910     //HiLo split
 911     SDValue LHS = N->getOperand(0);
 912     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
 913     SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
 914
 915     SDValue RHS = N->getOperand(1);
 916     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
 917     SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 918
 919     // Get Speculative values
 920     SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
 921     SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 922
 923     SDValue REM_Hi = zero;
 924     SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
 925
 926     SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
 927     SDValue DIV_Lo = zero;
 928
 929     const unsigned halfBitWidth = HalfVT.getSizeInBits();
 930
 931     for (unsigned i = 0; i < halfBitWidth; ++i) {
 932       SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
 933       // Get Value of high bit
 934       SDValue HBit;
 935       if (halfBitWidth == 32 && Subtarget->hasBFE()) {
 936         HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
 937       } else {
 938         HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
 939         HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
 940       }
 941
 942       SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
 943         DAG.getConstant(halfBitWidth - 1, HalfVT));
 944       REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
 945       REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
 946
 947       REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
 948       REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
 949
 950
 951       SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 952
 953       SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
 954       SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
 955
 956       DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 957
 958       // Update REM
 959
 960       SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
 961
 962       REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
 963       REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
 964       REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
 965     }
 966
 967     SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 968     SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
 969     Results.push_back(DIV);
 970     Results.push_back(REM);
 971     break;
 972   }
 973   }
 974 }
 975
 976 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 977                                                    SDValue Vector) const {
 978
 979   SDLoc DL(Vector);
 980   EVT VecVT = Vector.getValueType();
 981   EVT EltVT = VecVT.getVectorElementType();
 982   SmallVector<SDValue, 8> Args;
 983
 984   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 985                                                            i != e; ++i) {
 986     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 987                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 988   }
 989
 990   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 991 }
 992
 993 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 994                                                     SelectionDAG &DAG) const {
 995
 996   SDLoc DL(Op);
 997   SDValue Vector = Op.getOperand(0);
 998   SDValue Index = Op.getOperand(1);
 999
1000   if (isa<ConstantSDNode>(Index) ||
1001       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1002     return Op;
1003
1004   Vector = vectorToVerticalVector(DAG, Vector);
1005   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
1006                      Vector, Index);
1007 }
1008
1009 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
1010                                                    SelectionDAG &DAG) const {
1011   SDLoc DL(Op);
1012   SDValue Vector = Op.getOperand(0);
1013   SDValue Value = Op.getOperand(1);
1014   SDValue Index = Op.getOperand(2);
1015
1016   if (isa<ConstantSDNode>(Index) ||
1017       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1018     return Op;
1019
1020   Vector = vectorToVerticalVector(DAG, Vector);
1021   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
1022                                Vector, Value, Index);
1023   return vectorToVerticalVector(DAG, Insert);
1024 }
1025
1026 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
1027   // On hw >= R700, COS/SIN input must be between -1. and 1.
1028   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
1029   EVT VT = Op.getValueType();
1030   SDValue Arg = Op.getOperand(0);
1031   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
1032       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
1033         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
1034           DAG.getConstantFP(0.15915494309, MVT::f32)),
1035         DAG.getConstantFP(0.5, MVT::f32)));
1036   unsigned TrigNode;
1037   switch (Op.getOpcode()) {
1038   case ISD::FCOS:
1039     TrigNode = AMDGPUISD::COS_HW;
1040     break;
1041   case ISD::FSIN:
1042     TrigNode = AMDGPUISD::SIN_HW;
1043     break;
1044   default:
1045     llvm_unreachable("Wrong trig opcode");
1046   }
1047   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
1048       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
1049         DAG.getConstantFP(-0.5, MVT::f32)));
1050   if (Gen >= AMDGPUSubtarget::R700)
1051     return TrigVal;
1052   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1053   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1054       DAG.getConstantFP(3.14159265359, MVT::f32));
1055 }
1056
1057 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1058   SDLoc DL(Op);
1059   EVT VT = Op.getValueType();
1060
1061   SDValue Lo = Op.getOperand(0);
1062   SDValue Hi = Op.getOperand(1);
1063   SDValue Shift = Op.getOperand(2);
1064   SDValue Zero = DAG.getConstant(0, VT);
1065   SDValue One  = DAG.getConstant(1, VT);
1066
1067   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1068   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1069   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1070   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1071
1072   // The dance around Width1 is necessary for 0 special case.
1073   // Without it the CompShift might be 32, producing incorrect results in
1074   // Overflow. So we do the shift in two steps, the alternative is to
1075   // add a conditional to filter the special case.
1076
1077   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1078   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1079
1080   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1081   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1082   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1083
1084   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1085   SDValue LoBig = Zero;
1086
1087   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1088   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1089
1090   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1091 }
1092
1093 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1094   SDLoc DL(Op);
1095   EVT VT = Op.getValueType();
1096
1097   SDValue Lo = Op.getOperand(0);
1098   SDValue Hi = Op.getOperand(1);
1099   SDValue Shift = Op.getOperand(2);
1100   SDValue Zero = DAG.getConstant(0, VT);
1101   SDValue One  = DAG.getConstant(1, VT);
1102
1103   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1104
1105   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1106   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1107   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1108   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1109
1110   // The dance around Width1 is necessary for 0 special case.
1111   // Without it the CompShift might be 32, producing incorrect results in
1112   // Overflow. So we do the shift in two steps, the alternative is to
1113   // add a conditional to filter the special case.
1114
1115   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1116   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1117
1118   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1119   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1120   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1121
1122   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1123   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1124
1125   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1126   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1127
1128   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1129 }
1130
1131 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1132   return DAG.getNode(
1133       ISD::SETCC,
1134       SDLoc(Op),
1135       MVT::i1,
1136       Op, DAG.getConstantFP(0.0f, MVT::f32),
1137       DAG.getCondCode(ISD::SETNE)
1138       );
1139 }
1140
1141 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1142                                                    SDLoc DL,
1143                                                    unsigned DwordOffset) const {
1144   unsigned ByteOffset = DwordOffset * 4;
1145   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1146                                       AMDGPUAS::CONSTANT_BUFFER_0);
1147
1148   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1149   assert(isInt<16>(ByteOffset));
1150
1151   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1152                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1153                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1154                      false, false, false, 0);
1155 }
1156
1157 bool R600TargetLowering::isZero(SDValue Op) const {
1158   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1159     return Cst->isNullValue();
1160   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1161     return CstFP->isZero();
1162   } else {
1163     return false;
1164   }
1165 }
1166
1167 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1168   SDLoc DL(Op);
1169   EVT VT = Op.getValueType();
1170
1171   SDValue LHS = Op.getOperand(0);
1172   SDValue RHS = Op.getOperand(1);
1173   SDValue True = Op.getOperand(2);
1174   SDValue False = Op.getOperand(3);
1175   SDValue CC = Op.getOperand(4);
1176   SDValue Temp;
1177
1178   // LHS and RHS are guaranteed to be the same value type
1179   EVT CompareVT = LHS.getValueType();
1180
1181   // Check if we can lower this to a native operation.
1182
1183   // Try to lower to a SET* instruction:
1184   //
1185   // SET* can match the following patterns:
1186   //
1187   // select_cc f32, f32, -1,  0, cc_supported
1188   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1189   // select_cc i32, i32, -1,  0, cc_supported
1190   //
1191
1192   // Move hardware True/False values to the correct operand.
1193   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1194   ISD::CondCode InverseCC =
1195      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1196   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1197     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1198       std::swap(False, True);
1199       CC = DAG.getCondCode(InverseCC);
1200     } else {
1201       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1202       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1203         std::swap(False, True);
1204         std::swap(LHS, RHS);
1205         CC = DAG.getCondCode(SwapInvCC);
1206       }
1207     }
1208   }
1209
1210   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1211       (CompareVT == VT || VT == MVT::i32)) {
1212     // This can be matched by a SET* instruction.
1213     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1214   }
1215
1216   // Try to lower to a CND* instruction:
1217   //
1218   // CND* can match the following patterns:
1219   //
1220   // select_cc f32, 0.0, f32, f32, cc_supported
1221   // select_cc f32, 0.0, i32, i32, cc_supported
1222   // select_cc i32, 0,   f32, f32, cc_supported
1223   // select_cc i32, 0,   i32, i32, cc_supported
1224   //
1225
1226   // Try to move the zero value to the RHS
1227   if (isZero(LHS)) {
1228     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1229     // Try swapping the operands
1230     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1231     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1232       std::swap(LHS, RHS);
1233       CC = DAG.getCondCode(CCSwapped);
1234     } else {
1235       // Try inverting the conditon and then swapping the operands
1236       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1237       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1238       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1239         std::swap(True, False);
1240         std::swap(LHS, RHS);
1241         CC = DAG.getCondCode(CCSwapped);
1242       }
1243     }
1244   }
1245   if (isZero(RHS)) {
1246     SDValue Cond = LHS;
1247     SDValue Zero = RHS;
1248     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1249     if (CompareVT != VT) {
1250       // Bitcast True / False to the correct types.  This will end up being
1251       // a nop, but it allows us to define only a single pattern in the
1252       // .TD files for each CND* instruction rather than having to have
1253       // one pattern for integer True/False and one for fp True/False
1254       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1255       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1256     }
1257
1258     switch (CCOpcode) {
1259     case ISD::SETONE:
1260     case ISD::SETUNE:
1261     case ISD::SETNE:
1262       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1263       Temp = True;
1264       True = False;
1265       False = Temp;
1266       break;
1267     default:
1268       break;
1269     }
1270     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1271         Cond, Zero,
1272         True, False,
1273         DAG.getCondCode(CCOpcode));
1274     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1275   }
1276
1277   // If we make it this for it means we have no native instructions to handle
1278   // this SELECT_CC, so we must lower it.
1279   SDValue HWTrue, HWFalse;
1280
1281   if (CompareVT == MVT::f32) {
1282     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1283     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1284   } else if (CompareVT == MVT::i32) {
1285     HWTrue = DAG.getConstant(-1, CompareVT);
1286     HWFalse = DAG.getConstant(0, CompareVT);
1287   }
1288   else {
1289     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1290   }
1291
1292   // Lower this unsupported SELECT_CC into a combination of two supported
1293   // SELECT_CC operations.
1294   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1295
1296   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1297       Cond, HWFalse,
1298       True, False,
1299       DAG.getCondCode(ISD::SETNE));
1300 }
1301
1302 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1303 /// convert these pointers to a register index.  Each register holds
1304 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1305 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1306 /// for indirect addressing.
1307 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1308                                                unsigned StackWidth,
1309                                                SelectionDAG &DAG) const {
1310   unsigned SRLPad;
1311   switch(StackWidth) {
1312   case 1:
1313     SRLPad = 2;
1314     break;
1315   case 2:
1316     SRLPad = 3;
1317     break;
1318   case 4:
1319     SRLPad = 4;
1320     break;
1321   default: llvm_unreachable("Invalid stack width");
1322   }
1323
1324   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1325                      DAG.getConstant(SRLPad, MVT::i32));
1326 }
1327
1328 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1329                                          unsigned ElemIdx,
1330                                          unsigned &Channel,
1331                                          unsigned &PtrIncr) const {
1332   switch (StackWidth) {
1333   default:
1334   case 1:
1335     Channel = 0;
1336     if (ElemIdx > 0) {
1337       PtrIncr = 1;
1338     } else {
1339       PtrIncr = 0;
1340     }
1341     break;
1342   case 2:
1343     Channel = ElemIdx % 2;
1344     if (ElemIdx == 2) {
1345       PtrIncr = 1;
1346     } else {
1347       PtrIncr = 0;
1348     }
1349     break;
1350   case 4:
1351     Channel = ElemIdx;
1352     PtrIncr = 0;
1353     break;
1354   }
1355 }
1356
1357 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1358   SDLoc DL(Op);
1359   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1360   SDValue Chain = Op.getOperand(0);
1361   SDValue Value = Op.getOperand(1);
1362   SDValue Ptr = Op.getOperand(2);
1363
1364   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1365   if (Result.getNode()) {
1366     return Result;
1367   }
1368
1369   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1370     if (StoreNode->isTruncatingStore()) {
1371       EVT VT = Value.getValueType();
1372       assert(VT.bitsLE(MVT::i32));
1373       EVT MemVT = StoreNode->getMemoryVT();
1374       SDValue MaskConstant;
1375       if (MemVT == MVT::i8) {
1376         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1377       } else {
1378         assert(MemVT == MVT::i16);
1379         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1380       }
1381       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1382                                       DAG.getConstant(2, MVT::i32));
1383       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1384                                       DAG.getConstant(0x00000003, VT));
1385       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1386       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1387                                    DAG.getConstant(3, VT));
1388       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1389       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1390       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1391       // vector instead.
1392       SDValue Src[4] = {
1393         ShiftedValue,
1394         DAG.getConstant(0, MVT::i32),
1395         DAG.getConstant(0, MVT::i32),
1396         Mask
1397       };
1398       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1399       SDValue Args[3] = { Chain, Input, DWordAddr };
1400       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1401                                      Op->getVTList(), Args, MemVT,
1402                                      StoreNode->getMemOperand());
1403     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1404                Value.getValueType().bitsGE(MVT::i32)) {
1405       // Convert pointer from byte address to dword address.
1406       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1407                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1408                                     Ptr, DAG.getConstant(2, MVT::i32)));
1409
1410       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1411         llvm_unreachable("Truncated and indexed stores not supported yet");
1412       } else {
1413         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1414       }
1415       return Chain;
1416     }
1417   }
1418
1419   EVT ValueVT = Value.getValueType();
1420
1421   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1422     return SDValue();
1423   }
1424
1425   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1426   if (Ret.getNode()) {
1427     return Ret;
1428   }
1429   // Lowering for indirect addressing
1430
1431   const MachineFunction &MF = DAG.getMachineFunction();
1432   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1433                                          getTargetMachine().getFrameLowering());
1434   unsigned StackWidth = TFL->getStackWidth(MF);
1435
1436   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1437
1438   if (ValueVT.isVector()) {
1439     unsigned NumElemVT = ValueVT.getVectorNumElements();
1440     EVT ElemVT = ValueVT.getVectorElementType();
1441     SmallVector<SDValue, 4> Stores(NumElemVT);
1442
1443     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1444                                       "vector width in load");
1445
1446     for (unsigned i = 0; i < NumElemVT; ++i) {
1447       unsigned Channel, PtrIncr;
1448       getStackAddress(StackWidth, i, Channel, PtrIncr);
1449       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1450                         DAG.getConstant(PtrIncr, MVT::i32));
1451       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1452                                  Value, DAG.getConstant(i, MVT::i32));
1453
1454       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1455                               Chain, Elem, Ptr,
1456                               DAG.getTargetConstant(Channel, MVT::i32));
1457     }
1458      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1459    } else {
1460     if (ValueVT == MVT::i8) {
1461       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1462     }
1463     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1464     DAG.getTargetConstant(0, MVT::i32)); // Channel
1465   }
1466
1467   return Chain;
1468 }
1469
1470 // return (512 + (kc_bank << 12)
1471 static int
1472 ConstantAddressBlock(unsigned AddressSpace) {
1473   switch (AddressSpace) {
1474   case AMDGPUAS::CONSTANT_BUFFER_0:
1475     return 512;
1476   case AMDGPUAS::CONSTANT_BUFFER_1:
1477     return 512 + 4096;
1478   case AMDGPUAS::CONSTANT_BUFFER_2:
1479     return 512 + 4096 * 2;
1480   case AMDGPUAS::CONSTANT_BUFFER_3:
1481     return 512 + 4096 * 3;
1482   case AMDGPUAS::CONSTANT_BUFFER_4:
1483     return 512 + 4096 * 4;
1484   case AMDGPUAS::CONSTANT_BUFFER_5:
1485     return 512 + 4096 * 5;
1486   case AMDGPUAS::CONSTANT_BUFFER_6:
1487     return 512 + 4096 * 6;
1488   case AMDGPUAS::CONSTANT_BUFFER_7:
1489     return 512 + 4096 * 7;
1490   case AMDGPUAS::CONSTANT_BUFFER_8:
1491     return 512 + 4096 * 8;
1492   case AMDGPUAS::CONSTANT_BUFFER_9:
1493     return 512 + 4096 * 9;
1494   case AMDGPUAS::CONSTANT_BUFFER_10:
1495     return 512 + 4096 * 10;
1496   case AMDGPUAS::CONSTANT_BUFFER_11:
1497     return 512 + 4096 * 11;
1498   case AMDGPUAS::CONSTANT_BUFFER_12:
1499     return 512 + 4096 * 12;
1500   case AMDGPUAS::CONSTANT_BUFFER_13:
1501     return 512 + 4096 * 13;
1502   case AMDGPUAS::CONSTANT_BUFFER_14:
1503     return 512 + 4096 * 14;
1504   case AMDGPUAS::CONSTANT_BUFFER_15:
1505     return 512 + 4096 * 15;
1506   default:
1507     return -1;
1508   }
1509 }
1510
1511 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1512 {
1513   EVT VT = Op.getValueType();
1514   SDLoc DL(Op);
1515   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1516   SDValue Chain = Op.getOperand(0);
1517   SDValue Ptr = Op.getOperand(1);
1518   SDValue LoweredLoad;
1519
1520   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1521   if (Ret.getNode()) {
1522     SDValue Ops[2] = {
1523       Ret,
1524       Chain
1525     };
1526     return DAG.getMergeValues(Ops, DL);
1527   }
1528
1529
1530   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1531     SDValue MergedValues[2] = {
1532       SplitVectorLoad(Op, DAG),
1533       Chain
1534     };
1535     return DAG.getMergeValues(MergedValues, DL);
1536   }
1537
1538   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1539   if (ConstantBlock > -1 &&
1540       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1541        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1542     SDValue Result;
1543     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1544         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1545         isa<ConstantSDNode>(Ptr)) {
1546       SDValue Slots[4];
1547       for (unsigned i = 0; i < 4; i++) {
1548         // We want Const position encoded with the following formula :
1549         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1550         // const_index is Ptr computed by llvm using an alignment of 16.
1551         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1552         // then div by 4 at the ISel step
1553         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1554             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1555         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1556       }
1557       EVT NewVT = MVT::v4i32;
1558       unsigned NumElements = 4;
1559       if (VT.isVector()) {
1560         NewVT = VT;
1561         NumElements = VT.getVectorNumElements();
1562       }
1563       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1564                            makeArrayRef(Slots, NumElements));
1565     } else {
1566       // non-constant ptr can't be folded, keeps it as a v4f32 load
1567       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1568           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1569           DAG.getConstant(LoadNode->getAddressSpace() -
1570                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1571           );
1572     }
1573
1574     if (!VT.isVector()) {
1575       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1576           DAG.getConstant(0, MVT::i32));
1577     }
1578
1579     SDValue MergedValues[2] = {
1580       Result,
1581       Chain
1582     };
1583     return DAG.getMergeValues(MergedValues, DL);
1584   }
1585
1586   // For most operations returning SDValue() will result in the node being
1587   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1588   // need to manually expand loads that may be legal in some address spaces and
1589   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1590   // compute shaders, since the data is sign extended when it is uploaded to the
1591   // buffer. However SEXT loads from other address spaces are not supported, so
1592   // we need to expand them here.
1593   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1594     EVT MemVT = LoadNode->getMemoryVT();
1595     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1596     SDValue ShiftAmount =
1597           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1598     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1599                                   LoadNode->getPointerInfo(), MemVT,
1600                                   LoadNode->isVolatile(),
1601                                   LoadNode->isNonTemporal(),
1602                                   LoadNode->getAlignment());
1603     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1604     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1605
1606     SDValue MergedValues[2] = { Sra, Chain };
1607     return DAG.getMergeValues(MergedValues, DL);
1608   }
1609
1610   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1611     return SDValue();
1612   }
1613
1614   // Lowering for indirect addressing
1615   const MachineFunction &MF = DAG.getMachineFunction();
1616   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1617                                          getTargetMachine().getFrameLowering());
1618   unsigned StackWidth = TFL->getStackWidth(MF);
1619
1620   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1621
1622   if (VT.isVector()) {
1623     unsigned NumElemVT = VT.getVectorNumElements();
1624     EVT ElemVT = VT.getVectorElementType();
1625     SDValue Loads[4];
1626
1627     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1628                                       "vector width in load");
1629
1630     for (unsigned i = 0; i < NumElemVT; ++i) {
1631       unsigned Channel, PtrIncr;
1632       getStackAddress(StackWidth, i, Channel, PtrIncr);
1633       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1634                         DAG.getConstant(PtrIncr, MVT::i32));
1635       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1636                              Chain, Ptr,
1637                              DAG.getTargetConstant(Channel, MVT::i32),
1638                              Op.getOperand(2));
1639     }
1640     for (unsigned i = NumElemVT; i < 4; ++i) {
1641       Loads[i] = DAG.getUNDEF(ElemVT);
1642     }
1643     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1644     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1645   } else {
1646     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1647                               Chain, Ptr,
1648                               DAG.getTargetConstant(0, MVT::i32), // Channel
1649                               Op.getOperand(2));
1650   }
1651
1652   SDValue Ops[2] = {
1653     LoweredLoad,
1654     Chain
1655   };
1656
1657   return DAG.getMergeValues(Ops, DL);
1658 }
1659
1660 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1661   SDValue Chain = Op.getOperand(0);
1662   SDValue Cond  = Op.getOperand(1);
1663   SDValue Jump  = Op.getOperand(2);
1664
1665   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1666                      Chain, Jump, Cond);
1667 }
1668
1669 /// XXX Only kernel functions are supported, so we can assume for now that
1670 /// every function is a kernel function, but in the future we should use
1671 /// separate calling conventions for kernel and non-kernel functions.
1672 SDValue R600TargetLowering::LowerFormalArguments(
1673                                       SDValue Chain,
1674                                       CallingConv::ID CallConv,
1675                                       bool isVarArg,
1676                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1677                                       SDLoc DL, SelectionDAG &DAG,
1678                                       SmallVectorImpl<SDValue> &InVals) const {
1679   SmallVector<CCValAssign, 16> ArgLocs;
1680   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1681                  getTargetMachine(), ArgLocs, *DAG.getContext());
1682   MachineFunction &MF = DAG.getMachineFunction();
1683   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
1684
1685   SmallVector<ISD::InputArg, 8> LocalIns;
1686
1687   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1688
1689   AnalyzeFormalArguments(CCInfo, LocalIns);
1690
1691   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1692     CCValAssign &VA = ArgLocs[i];
1693     EVT VT = Ins[i].VT;
1694     EVT MemVT = LocalIns[i].VT;
1695
1696     if (ShaderType != ShaderType::COMPUTE) {
1697       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1698       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1699       InVals.push_back(Register);
1700       continue;
1701     }
1702
1703     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1704                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1705
1706     // i64 isn't a legal type, so the register type used ends up as i32, which
1707     // isn't expected here. It attempts to create this sextload, but it ends up
1708     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1709     // for <1 x i64>.
1710
1711     // The first 36 bytes of the input buffer contains information about
1712     // thread group and global sizes.
1713
1714     // FIXME: This should really check the extload type, but the handling of
1715     // extload vecto parameters seems to be broken.
1716     //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1717     ISD::LoadExtType Ext = ISD::SEXTLOAD;
1718     SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
1719                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1720                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1721                                  MemVT, false, false, 4);
1722
1723     // 4 is the preferred alignment for the CONSTANT memory space.
1724     InVals.push_back(Arg);
1725   }
1726   return Chain;
1727 }
1728
1729 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1730    if (!VT.isVector())
1731      return MVT::i32;
1732    return VT.changeVectorElementTypeToInteger();
1733 }
1734
1735 static SDValue CompactSwizzlableVector(
1736   SelectionDAG &DAG, SDValue VectorEntry,
1737   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1738   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1739   assert(RemapSwizzle.empty());
1740   SDValue NewBldVec[4] = {
1741     VectorEntry.getOperand(0),
1742     VectorEntry.getOperand(1),
1743     VectorEntry.getOperand(2),
1744     VectorEntry.getOperand(3)
1745   };
1746
1747   for (unsigned i = 0; i < 4; i++) {
1748     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1749       // We mask write here to teach later passes that the ith element of this
1750       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1751       // break false dependencies and additionnaly make assembly easier to read.
1752       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1753     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1754       if (C->isZero()) {
1755         RemapSwizzle[i] = 4; // SEL_0
1756         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1757       } else if (C->isExactlyValue(1.0)) {
1758         RemapSwizzle[i] = 5; // SEL_1
1759         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1760       }
1761     }
1762
1763     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1764       continue;
1765     for (unsigned j = 0; j < i; j++) {
1766       if (NewBldVec[i] == NewBldVec[j]) {
1767         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1768         RemapSwizzle[i] = j;
1769         break;
1770       }
1771     }
1772   }
1773
1774   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1775                      VectorEntry.getValueType(), NewBldVec);
1776 }
1777
1778 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1779                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1780   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1781   assert(RemapSwizzle.empty());
1782   SDValue NewBldVec[4] = {
1783       VectorEntry.getOperand(0),
1784       VectorEntry.getOperand(1),
1785       VectorEntry.getOperand(2),
1786       VectorEntry.getOperand(3)
1787   };
1788   bool isUnmovable[4] = { false, false, false, false };
1789   for (unsigned i = 0; i < 4; i++) {
1790     RemapSwizzle[i] = i;
1791     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1792       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1793           ->getZExtValue();
1794       if (i == Idx)
1795         isUnmovable[Idx] = true;
1796     }
1797   }
1798
1799   for (unsigned i = 0; i < 4; i++) {
1800     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1801       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1802           ->getZExtValue();
1803       if (isUnmovable[Idx])
1804         continue;
1805       // Swap i and Idx
1806       std::swap(NewBldVec[Idx], NewBldVec[i]);
1807       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1808       break;
1809     }
1810   }
1811
1812   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1813                      VectorEntry.getValueType(), NewBldVec);
1814 }
1815
1816
1817 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1818 SDValue Swz[4], SelectionDAG &DAG) const {
1819   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1820   // Old -> New swizzle values
1821   DenseMap<unsigned, unsigned> SwizzleRemap;
1822
1823   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1824   for (unsigned i = 0; i < 4; i++) {
1825     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1826     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1827       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1828   }
1829
1830   SwizzleRemap.clear();
1831   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1832   for (unsigned i = 0; i < 4; i++) {
1833     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1834     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1835       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1836   }
1837
1838   return BuildVector;
1839 }
1840
1841
1842 //===----------------------------------------------------------------------===//
1843 // Custom DAG Optimizations
1844 //===----------------------------------------------------------------------===//
1845
1846 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1847                                               DAGCombinerInfo &DCI) const {
1848   SelectionDAG &DAG = DCI.DAG;
1849
1850   switch (N->getOpcode()) {
1851   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1852   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1853   case ISD::FP_ROUND: {
1854       SDValue Arg = N->getOperand(0);
1855       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1856         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1857                            Arg.getOperand(0));
1858       }
1859       break;
1860     }
1861
1862   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1863   // (i32 select_cc f32, f32, -1, 0 cc)
1864   //
1865   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1866   // this to one of the SET*_DX10 instructions.
1867   case ISD::FP_TO_SINT: {
1868     SDValue FNeg = N->getOperand(0);
1869     if (FNeg.getOpcode() != ISD::FNEG) {
1870       return SDValue();
1871     }
1872     SDValue SelectCC = FNeg.getOperand(0);
1873     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1874         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1875         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1876         !isHWTrueValue(SelectCC.getOperand(2)) ||
1877         !isHWFalseValue(SelectCC.getOperand(3))) {
1878       return SDValue();
1879     }
1880
1881     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1882                            SelectCC.getOperand(0), // LHS
1883                            SelectCC.getOperand(1), // RHS
1884                            DAG.getConstant(-1, MVT::i32), // True
1885                            DAG.getConstant(0, MVT::i32),  // Flase
1886                            SelectCC.getOperand(4)); // CC
1887
1888     break;
1889   }
1890
1891   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1892   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1893   case ISD::INSERT_VECTOR_ELT: {
1894     SDValue InVec = N->getOperand(0);
1895     SDValue InVal = N->getOperand(1);
1896     SDValue EltNo = N->getOperand(2);
1897     SDLoc dl(N);
1898
1899     // If the inserted element is an UNDEF, just use the input vector.
1900     if (InVal.getOpcode() == ISD::UNDEF)
1901       return InVec;
1902
1903     EVT VT = InVec.getValueType();
1904
1905     // If we can't generate a legal BUILD_VECTOR, exit
1906     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1907       return SDValue();
1908
1909     // Check that we know which element is being inserted
1910     if (!isa<ConstantSDNode>(EltNo))
1911       return SDValue();
1912     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1913
1914     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1915     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1916     // vector elements.
1917     SmallVector<SDValue, 8> Ops;
1918     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1919       Ops.append(InVec.getNode()->op_begin(),
1920                  InVec.getNode()->op_end());
1921     } else if (InVec.getOpcode() == ISD::UNDEF) {
1922       unsigned NElts = VT.getVectorNumElements();
1923       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1924     } else {
1925       return SDValue();
1926     }
1927
1928     // Insert the element
1929     if (Elt < Ops.size()) {
1930       // All the operands of BUILD_VECTOR must have the same type;
1931       // we enforce that here.
1932       EVT OpVT = Ops[0].getValueType();
1933       if (InVal.getValueType() != OpVT)
1934         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1935           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1936           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1937       Ops[Elt] = InVal;
1938     }
1939
1940     // Return the new vector
1941     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1942   }
1943
1944   // Extract_vec (Build_vector) generated by custom lowering
1945   // also needs to be customly combined
1946   case ISD::EXTRACT_VECTOR_ELT: {
1947     SDValue Arg = N->getOperand(0);
1948     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1949       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1950         unsigned Element = Const->getZExtValue();
1951         return Arg->getOperand(Element);
1952       }
1953     }
1954     if (Arg.getOpcode() == ISD::BITCAST &&
1955         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1956       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1957         unsigned Element = Const->getZExtValue();
1958         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1959             Arg->getOperand(0).getOperand(Element));
1960       }
1961     }
1962   }
1963
1964   case ISD::SELECT_CC: {
1965     // Try common optimizations
1966     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1967     if (Ret.getNode())
1968       return Ret;
1969
1970     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1971     //      selectcc x, y, a, b, inv(cc)
1972     //
1973     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1974     //      selectcc x, y, a, b, cc
1975     SDValue LHS = N->getOperand(0);
1976     if (LHS.getOpcode() != ISD::SELECT_CC) {
1977       return SDValue();
1978     }
1979
1980     SDValue RHS = N->getOperand(1);
1981     SDValue True = N->getOperand(2);
1982     SDValue False = N->getOperand(3);
1983     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1984
1985     if (LHS.getOperand(2).getNode() != True.getNode() ||
1986         LHS.getOperand(3).getNode() != False.getNode() ||
1987         RHS.getNode() != False.getNode()) {
1988       return SDValue();
1989     }
1990
1991     switch (NCC) {
1992     default: return SDValue();
1993     case ISD::SETNE: return LHS;
1994     case ISD::SETEQ: {
1995       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1996       LHSCC = ISD::getSetCCInverse(LHSCC,
1997                                   LHS.getOperand(0).getValueType().isInteger());
1998       if (DCI.isBeforeLegalizeOps() ||
1999           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2000         return DAG.getSelectCC(SDLoc(N),
2001                                LHS.getOperand(0),
2002                                LHS.getOperand(1),
2003                                LHS.getOperand(2),
2004                                LHS.getOperand(3),
2005                                LHSCC);
2006       break;
2007     }
2008     }
2009     return SDValue();
2010   }
2011
2012   case AMDGPUISD::EXPORT: {
2013     SDValue Arg = N->getOperand(1);
2014     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2015       break;
2016
2017     SDValue NewArgs[8] = {
2018       N->getOperand(0), // Chain
2019       SDValue(),
2020       N->getOperand(2), // ArrayBase
2021       N->getOperand(3), // Type
2022       N->getOperand(4), // SWZ_X
2023       N->getOperand(5), // SWZ_Y
2024       N->getOperand(6), // SWZ_Z
2025       N->getOperand(7) // SWZ_W
2026     };
2027     SDLoc DL(N);
2028     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2029     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2030   }
2031   case AMDGPUISD::TEXTURE_FETCH: {
2032     SDValue Arg = N->getOperand(1);
2033     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2034       break;
2035
2036     SDValue NewArgs[19] = {
2037       N->getOperand(0),
2038       N->getOperand(1),
2039       N->getOperand(2),
2040       N->getOperand(3),
2041       N->getOperand(4),
2042       N->getOperand(5),
2043       N->getOperand(6),
2044       N->getOperand(7),
2045       N->getOperand(8),
2046       N->getOperand(9),
2047       N->getOperand(10),
2048       N->getOperand(11),
2049       N->getOperand(12),
2050       N->getOperand(13),
2051       N->getOperand(14),
2052       N->getOperand(15),
2053       N->getOperand(16),
2054       N->getOperand(17),
2055       N->getOperand(18),
2056     };
2057     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2058     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2059         NewArgs);
2060   }
2061   }
2062
2063   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2064 }
2065
2066 static bool
2067 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2068             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2069   const R600InstrInfo *TII =
2070       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2071   if (!Src.isMachineOpcode())
2072     return false;
2073   switch (Src.getMachineOpcode()) {
2074   case AMDGPU::FNEG_R600:
2075     if (!Neg.getNode())
2076       return false;
2077     Src = Src.getOperand(0);
2078     Neg = DAG.getTargetConstant(1, MVT::i32);
2079     return true;
2080   case AMDGPU::FABS_R600:
2081     if (!Abs.getNode())
2082       return false;
2083     Src = Src.getOperand(0);
2084     Abs = DAG.getTargetConstant(1, MVT::i32);
2085     return true;
2086   case AMDGPU::CONST_COPY: {
2087     unsigned Opcode = ParentNode->getMachineOpcode();
2088     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2089
2090     if (!Sel.getNode())
2091       return false;
2092
2093     SDValue CstOffset = Src.getOperand(0);
2094     if (ParentNode->getValueType(0).isVector())
2095       return false;
2096
2097     // Gather constants values
2098     int SrcIndices[] = {
2099       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2100       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2101       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2102       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2103       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2104       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2105       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2106       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2107       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2108       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2109       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2110     };
2111     std::vector<unsigned> Consts;
2112     for (int OtherSrcIdx : SrcIndices) {
2113       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2114       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2115         continue;
2116       if (HasDst) {
2117         OtherSrcIdx--;
2118         OtherSelIdx--;
2119       }
2120       if (RegisterSDNode *Reg =
2121           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2122         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2123           ConstantSDNode *Cst
2124             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2125           Consts.push_back(Cst->getZExtValue());
2126         }
2127       }
2128     }
2129
2130     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2131     Consts.push_back(Cst->getZExtValue());
2132     if (!TII->fitsConstReadLimitations(Consts)) {
2133       return false;
2134     }
2135
2136     Sel = CstOffset;
2137     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2138     return true;
2139   }
2140   case AMDGPU::MOV_IMM_I32:
2141   case AMDGPU::MOV_IMM_F32: {
2142     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2143     uint64_t ImmValue = 0;
2144
2145
2146     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2147       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2148       float FloatValue = FPC->getValueAPF().convertToFloat();
2149       if (FloatValue == 0.0) {
2150         ImmReg = AMDGPU::ZERO;
2151       } else if (FloatValue == 0.5) {
2152         ImmReg = AMDGPU::HALF;
2153       } else if (FloatValue == 1.0) {
2154         ImmReg = AMDGPU::ONE;
2155       } else {
2156         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2157       }
2158     } else {
2159       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2160       uint64_t Value = C->getZExtValue();
2161       if (Value == 0) {
2162         ImmReg = AMDGPU::ZERO;
2163       } else if (Value == 1) {
2164         ImmReg = AMDGPU::ONE_INT;
2165       } else {
2166         ImmValue = Value;
2167       }
2168     }
2169
2170     // Check that we aren't already using an immediate.
2171     // XXX: It's possible for an instruction to have more than one
2172     // immediate operand, but this is not supported yet.
2173     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2174       if (!Imm.getNode())
2175         return false;
2176       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2177       assert(C);
2178       if (C->getZExtValue())
2179         return false;
2180       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2181     }
2182     Src = DAG.getRegister(ImmReg, MVT::i32);
2183     return true;
2184   }
2185   default:
2186     return false;
2187   }
2188 }
2189
2190
2191 /// \brief Fold the instructions after selecting them
2192 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2193                                             SelectionDAG &DAG) const {
2194   const R600InstrInfo *TII =
2195       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2196   if (!Node->isMachineOpcode())
2197     return Node;
2198   unsigned Opcode = Node->getMachineOpcode();
2199   SDValue FakeOp;
2200
2201   std::vector<SDValue> Ops;
2202   for (const SDUse &I : Node->ops())
2203     Ops.push_back(I);
2204
2205   if (Opcode == AMDGPU::DOT_4) {
2206     int OperandIdx[] = {
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2215         };
2216     int NegIdx[] = {
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2222       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2223       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2224       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2225     };
2226     int AbsIdx[] = {
2227       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2228       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2229       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2230       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2231       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2232       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2233       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2234       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2235     };
2236     for (unsigned i = 0; i < 8; i++) {
2237       if (OperandIdx[i] < 0)
2238         return Node;
2239       SDValue &Src = Ops[OperandIdx[i] - 1];
2240       SDValue &Neg = Ops[NegIdx[i] - 1];
2241       SDValue &Abs = Ops[AbsIdx[i] - 1];
2242       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2243       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2244       if (HasDst)
2245         SelIdx--;
2246       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2247       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2248         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2249     }
2250   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2251     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2252       SDValue &Src = Ops[i];
2253       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2254         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2255     }
2256   } else if (Opcode == AMDGPU::CLAMP_R600) {
2257     SDValue Src = Node->getOperand(0);
2258     if (!Src.isMachineOpcode() ||
2259         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2260       return Node;
2261     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2262         AMDGPU::OpName::clamp);
2263     if (ClampIdx < 0)
2264       return Node;
2265     std::vector<SDValue> Ops;
2266     unsigned NumOp = Src.getNumOperands();
2267     for(unsigned i = 0; i < NumOp; ++i)
2268           Ops.push_back(Src.getOperand(i));
2269     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2270     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2271         Node->getVTList(), Ops);
2272   } else {
2273     if (!TII->hasInstrModifiers(Opcode))
2274       return Node;
2275     int OperandIdx[] = {
2276       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2277       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2278       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2279     };
2280     int NegIdx[] = {
2281       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2282       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2283       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2284     };
2285     int AbsIdx[] = {
2286       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2287       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2288       -1
2289     };
2290     for (unsigned i = 0; i < 3; i++) {
2291       if (OperandIdx[i] < 0)
2292         return Node;
2293       SDValue &Src = Ops[OperandIdx[i] - 1];
2294       SDValue &Neg = Ops[NegIdx[i] - 1];
2295       SDValue FakeAbs;
2296       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2297       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2298       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2299       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2300       if (HasDst) {
2301         SelIdx--;
2302         ImmIdx--;
2303       }
2304       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2305       SDValue &Imm = Ops[ImmIdx];
2306       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2307         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2308     }
2309   }
2310
2311   return Node;
2312 }