lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDILIntrinsicInfo.h"
  17 #include "AMDGPUFrameLowering.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/CodeGen/CallingConvLower.h"
  23 #include "llvm/CodeGen/MachineFrameInfo.h"
  24 #include "llvm/CodeGen/MachineInstrBuilder.h"
  25 #include "llvm/CodeGen/MachineRegisterInfo.h"
  26 #include "llvm/CodeGen/SelectionDAG.h"
  27 #include "llvm/IR/Argument.h"
  28 #include "llvm/IR/Function.h"
  29
  30 using namespace llvm;
  31
  32 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  33     AMDGPUTargetLowering(TM),
  34     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  35   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  37   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  38   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  39   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  40   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  41
  42   computeRegisterProperties();
  43
  44   // Set condition code actions
  45   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  46   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  57
  58   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  59   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  62
  63   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  64   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  65
  66   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  67   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  68
  69   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  70   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  71
  72   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  73
  74   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  75   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  76   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  77
  78   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  79   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  80
  81   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  82   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  83   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  84
  85   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  86   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  87   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  88   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  89
  90   // Expand sign extension of vectors
  91   if (!Subtarget->hasBFE())
  92     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  93
  94   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  95   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
  96
  97   if (!Subtarget->hasBFE())
  98     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 100   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 101
 102   if (!Subtarget->hasBFE())
 103     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 105   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 106
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 112
 113
 114   // Legalize loads and stores to the private address space.
 115   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 116   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 117   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 118
 119   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 120   // spaces, so it is custom lowered to handle those where it isn't.
 121   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
 122   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
 123   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
 124   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 125   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 126   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 127
 128   setOperationAction(ISD::STORE, MVT::i8, Custom);
 129   setOperationAction(ISD::STORE, MVT::i32, Custom);
 130   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 131   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 132   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 133   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 134
 135   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 136   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 137   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 138
 139   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 140   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 141   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 142   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 143
 144   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 145   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 146   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 147   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 148
 149   setTargetDAGCombine(ISD::FP_ROUND);
 150   setTargetDAGCombine(ISD::FP_TO_SINT);
 151   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 152   setTargetDAGCombine(ISD::SELECT_CC);
 153   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 154
 155   // These should be replaced by UDVIREM, but it does not happen automatically
 156   // during Type Legalization
 157   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 158   setOperationAction(ISD::UREM, MVT::i64, Custom);
 159
 160   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 161   //  to be Legal/Custom in order to avoid library calls.
 162   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 163   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 164   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 165
 166   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 167
 168   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 169   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 170   setSchedulingPreference(Sched::Source);
 171 }
 172
 173 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 174     MachineInstr * MI, MachineBasicBlock * BB) const {
 175   MachineFunction * MF = BB->getParent();
 176   MachineRegisterInfo &MRI = MF->getRegInfo();
 177   MachineBasicBlock::iterator I = *MI;
 178   const R600InstrInfo *TII =
 179     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 180
 181   switch (MI->getOpcode()) {
 182   default:
 183     // Replace LDS_*_RET instruction that don't have any uses with the
 184     // equivalent LDS_*_NORET instruction.
 185     if (TII->isLDSRetInstr(MI->getOpcode())) {
 186       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 187       assert(DstIdx != -1);
 188       MachineInstrBuilder NewMI;
 189       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
 190         return BB;
 191
 192       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 193                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 194       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 195         NewMI.addOperand(MI->getOperand(i));
 196       }
 197     } else {
 198       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 199     }
 200     break;
 201   case AMDGPU::CLAMP_R600: {
 202     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 203                                                    AMDGPU::MOV,
 204                                                    MI->getOperand(0).getReg(),
 205                                                    MI->getOperand(1).getReg());
 206     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 207     break;
 208   }
 209
 210   case AMDGPU::FABS_R600: {
 211     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 212                                                     AMDGPU::MOV,
 213                                                     MI->getOperand(0).getReg(),
 214                                                     MI->getOperand(1).getReg());
 215     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 216     break;
 217   }
 218
 219   case AMDGPU::FNEG_R600: {
 220     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 221                                                     AMDGPU::MOV,
 222                                                     MI->getOperand(0).getReg(),
 223                                                     MI->getOperand(1).getReg());
 224     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 225     break;
 226   }
 227
 228   case AMDGPU::MASK_WRITE: {
 229     unsigned maskedRegister = MI->getOperand(0).getReg();
 230     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 231     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 232     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 233     break;
 234   }
 235
 236   case AMDGPU::MOV_IMM_F32:
 237     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 238                      MI->getOperand(1).getFPImm()->getValueAPF()
 239                          .bitcastToAPInt().getZExtValue());
 240     break;
 241   case AMDGPU::MOV_IMM_I32:
 242     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 243                      MI->getOperand(1).getImm());
 244     break;
 245   case AMDGPU::CONST_COPY: {
 246     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 247         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 248     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 249         MI->getOperand(1).getImm());
 250     break;
 251   }
 252
 253   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 254   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 255   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 256     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 257
 258     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 259             .addOperand(MI->getOperand(0))
 260             .addOperand(MI->getOperand(1))
 261             .addImm(EOP); // Set End of program bit
 262     break;
 263   }
 264
 265   case AMDGPU::TXD: {
 266     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 267     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 268     MachineOperand &RID = MI->getOperand(4);
 269     MachineOperand &SID = MI->getOperand(5);
 270     unsigned TextureId = MI->getOperand(6).getImm();
 271     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 272     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 273
 274     switch (TextureId) {
 275     case 5: // Rect
 276       CTX = CTY = 0;
 277       break;
 278     case 6: // Shadow1D
 279       SrcW = SrcZ;
 280       break;
 281     case 7: // Shadow2D
 282       SrcW = SrcZ;
 283       break;
 284     case 8: // ShadowRect
 285       CTX = CTY = 0;
 286       SrcW = SrcZ;
 287       break;
 288     case 9: // 1DArray
 289       SrcZ = SrcY;
 290       CTZ = 0;
 291       break;
 292     case 10: // 2DArray
 293       CTZ = 0;
 294       break;
 295     case 11: // Shadow1DArray
 296       SrcZ = SrcY;
 297       CTZ = 0;
 298       break;
 299     case 12: // Shadow2DArray
 300       CTZ = 0;
 301       break;
 302     }
 303     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 304             .addOperand(MI->getOperand(3))
 305             .addImm(SrcX)
 306             .addImm(SrcY)
 307             .addImm(SrcZ)
 308             .addImm(SrcW)
 309             .addImm(0)
 310             .addImm(0)
 311             .addImm(0)
 312             .addImm(0)
 313             .addImm(1)
 314             .addImm(2)
 315             .addImm(3)
 316             .addOperand(RID)
 317             .addOperand(SID)
 318             .addImm(CTX)
 319             .addImm(CTY)
 320             .addImm(CTZ)
 321             .addImm(CTW);
 322     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 323             .addOperand(MI->getOperand(2))
 324             .addImm(SrcX)
 325             .addImm(SrcY)
 326             .addImm(SrcZ)
 327             .addImm(SrcW)
 328             .addImm(0)
 329             .addImm(0)
 330             .addImm(0)
 331             .addImm(0)
 332             .addImm(1)
 333             .addImm(2)
 334             .addImm(3)
 335             .addOperand(RID)
 336             .addOperand(SID)
 337             .addImm(CTX)
 338             .addImm(CTY)
 339             .addImm(CTZ)
 340             .addImm(CTW);
 341     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 342             .addOperand(MI->getOperand(0))
 343             .addOperand(MI->getOperand(1))
 344             .addImm(SrcX)
 345             .addImm(SrcY)
 346             .addImm(SrcZ)
 347             .addImm(SrcW)
 348             .addImm(0)
 349             .addImm(0)
 350             .addImm(0)
 351             .addImm(0)
 352             .addImm(1)
 353             .addImm(2)
 354             .addImm(3)
 355             .addOperand(RID)
 356             .addOperand(SID)
 357             .addImm(CTX)
 358             .addImm(CTY)
 359             .addImm(CTZ)
 360             .addImm(CTW)
 361             .addReg(T0, RegState::Implicit)
 362             .addReg(T1, RegState::Implicit);
 363     break;
 364   }
 365
 366   case AMDGPU::TXD_SHADOW: {
 367     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 368     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 369     MachineOperand &RID = MI->getOperand(4);
 370     MachineOperand &SID = MI->getOperand(5);
 371     unsigned TextureId = MI->getOperand(6).getImm();
 372     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 373     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 374
 375     switch (TextureId) {
 376     case 5: // Rect
 377       CTX = CTY = 0;
 378       break;
 379     case 6: // Shadow1D
 380       SrcW = SrcZ;
 381       break;
 382     case 7: // Shadow2D
 383       SrcW = SrcZ;
 384       break;
 385     case 8: // ShadowRect
 386       CTX = CTY = 0;
 387       SrcW = SrcZ;
 388       break;
 389     case 9: // 1DArray
 390       SrcZ = SrcY;
 391       CTZ = 0;
 392       break;
 393     case 10: // 2DArray
 394       CTZ = 0;
 395       break;
 396     case 11: // Shadow1DArray
 397       SrcZ = SrcY;
 398       CTZ = 0;
 399       break;
 400     case 12: // Shadow2DArray
 401       CTZ = 0;
 402       break;
 403     }
 404
 405     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 406             .addOperand(MI->getOperand(3))
 407             .addImm(SrcX)
 408             .addImm(SrcY)
 409             .addImm(SrcZ)
 410             .addImm(SrcW)
 411             .addImm(0)
 412             .addImm(0)
 413             .addImm(0)
 414             .addImm(0)
 415             .addImm(1)
 416             .addImm(2)
 417             .addImm(3)
 418             .addOperand(RID)
 419             .addOperand(SID)
 420             .addImm(CTX)
 421             .addImm(CTY)
 422             .addImm(CTZ)
 423             .addImm(CTW);
 424     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 425             .addOperand(MI->getOperand(2))
 426             .addImm(SrcX)
 427             .addImm(SrcY)
 428             .addImm(SrcZ)
 429             .addImm(SrcW)
 430             .addImm(0)
 431             .addImm(0)
 432             .addImm(0)
 433             .addImm(0)
 434             .addImm(1)
 435             .addImm(2)
 436             .addImm(3)
 437             .addOperand(RID)
 438             .addOperand(SID)
 439             .addImm(CTX)
 440             .addImm(CTY)
 441             .addImm(CTZ)
 442             .addImm(CTW);
 443     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 444             .addOperand(MI->getOperand(0))
 445             .addOperand(MI->getOperand(1))
 446             .addImm(SrcX)
 447             .addImm(SrcY)
 448             .addImm(SrcZ)
 449             .addImm(SrcW)
 450             .addImm(0)
 451             .addImm(0)
 452             .addImm(0)
 453             .addImm(0)
 454             .addImm(1)
 455             .addImm(2)
 456             .addImm(3)
 457             .addOperand(RID)
 458             .addOperand(SID)
 459             .addImm(CTX)
 460             .addImm(CTY)
 461             .addImm(CTZ)
 462             .addImm(CTW)
 463             .addReg(T0, RegState::Implicit)
 464             .addReg(T1, RegState::Implicit);
 465     break;
 466   }
 467
 468   case AMDGPU::BRANCH:
 469       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 470               .addOperand(MI->getOperand(0));
 471       break;
 472
 473   case AMDGPU::BRANCH_COND_f32: {
 474     MachineInstr *NewMI =
 475       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 476               AMDGPU::PREDICATE_BIT)
 477               .addOperand(MI->getOperand(1))
 478               .addImm(OPCODE_IS_NOT_ZERO)
 479               .addImm(0); // Flags
 480     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 481     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 482             .addOperand(MI->getOperand(0))
 483             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 484     break;
 485   }
 486
 487   case AMDGPU::BRANCH_COND_i32: {
 488     MachineInstr *NewMI =
 489       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 490             AMDGPU::PREDICATE_BIT)
 491             .addOperand(MI->getOperand(1))
 492             .addImm(OPCODE_IS_NOT_ZERO_INT)
 493             .addImm(0); // Flags
 494     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 495     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 496            .addOperand(MI->getOperand(0))
 497             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 498     break;
 499   }
 500
 501   case AMDGPU::EG_ExportSwz:
 502   case AMDGPU::R600_ExportSwz: {
 503     // Instruction is left unmodified if its not the last one of its type
 504     bool isLastInstructionOfItsType = true;
 505     unsigned InstExportType = MI->getOperand(1).getImm();
 506     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 507          EndBlock = BB->end(); NextExportInst != EndBlock;
 508          NextExportInst = std::next(NextExportInst)) {
 509       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 510           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 511         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 512             .getImm();
 513         if (CurrentInstExportType == InstExportType) {
 514           isLastInstructionOfItsType = false;
 515           break;
 516         }
 517       }
 518     }
 519     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 520     if (!EOP && !isLastInstructionOfItsType)
 521       return BB;
 522     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 523     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 524             .addOperand(MI->getOperand(0))
 525             .addOperand(MI->getOperand(1))
 526             .addOperand(MI->getOperand(2))
 527             .addOperand(MI->getOperand(3))
 528             .addOperand(MI->getOperand(4))
 529             .addOperand(MI->getOperand(5))
 530             .addOperand(MI->getOperand(6))
 531             .addImm(CfInst)
 532             .addImm(EOP);
 533     break;
 534   }
 535   case AMDGPU::RETURN: {
 536     // RETURN instructions must have the live-out registers as implicit uses,
 537     // otherwise they appear dead.
 538     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 539     MachineInstrBuilder MIB(*MF, MI);
 540     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 541       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 542     return BB;
 543   }
 544   }
 545
 546   MI->eraseFromParent();
 547   return BB;
 548 }
 549
 550 //===----------------------------------------------------------------------===//
 551 // Custom DAG Lowering Operations
 552 //===----------------------------------------------------------------------===//
 553
 554 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 555   MachineFunction &MF = DAG.getMachineFunction();
 556   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 557   switch (Op.getOpcode()) {
 558   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 559   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 560   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 561   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 562   case ISD::SRA_PARTS:
 563   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 564   case ISD::FCOS:
 565   case ISD::FSIN: return LowerTrig(Op, DAG);
 566   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 567   case ISD::STORE: return LowerSTORE(Op, DAG);
 568   case ISD::LOAD: return LowerLOAD(Op, DAG);
 569   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 570   case ISD::INTRINSIC_VOID: {
 571     SDValue Chain = Op.getOperand(0);
 572     unsigned IntrinsicID =
 573                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 574     switch (IntrinsicID) {
 575     case AMDGPUIntrinsic::AMDGPU_store_output: {
 576       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 577       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 578       MFI->LiveOuts.push_back(Reg);
 579       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 580     }
 581     case AMDGPUIntrinsic::R600_store_swizzle: {
 582       const SDValue Args[8] = {
 583         Chain,
 584         Op.getOperand(2), // Export Value
 585         Op.getOperand(3), // ArrayBase
 586         Op.getOperand(4), // Type
 587         DAG.getConstant(0, MVT::i32), // SWZ_X
 588         DAG.getConstant(1, MVT::i32), // SWZ_Y
 589         DAG.getConstant(2, MVT::i32), // SWZ_Z
 590         DAG.getConstant(3, MVT::i32) // SWZ_W
 591       };
 592       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 593     }
 594
 595     // default for switch(IntrinsicID)
 596     default: break;
 597     }
 598     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 599     break;
 600   }
 601   case ISD::INTRINSIC_WO_CHAIN: {
 602     unsigned IntrinsicID =
 603                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 604     EVT VT = Op.getValueType();
 605     SDLoc DL(Op);
 606     switch(IntrinsicID) {
 607     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 608     case AMDGPUIntrinsic::R600_load_input: {
 609       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 610       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 611       MachineFunction &MF = DAG.getMachineFunction();
 612       MachineRegisterInfo &MRI = MF.getRegInfo();
 613       MRI.addLiveIn(Reg);
 614       return DAG.getCopyFromReg(DAG.getEntryNode(),
 615           SDLoc(DAG.getEntryNode()), Reg, VT);
 616     }
 617
 618     case AMDGPUIntrinsic::R600_interp_input: {
 619       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 620       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 621       MachineSDNode *interp;
 622       if (ijb < 0) {
 623         const MachineFunction &MF = DAG.getMachineFunction();
 624         const R600InstrInfo *TII =
 625           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 626         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 627             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 628         return DAG.getTargetExtractSubreg(
 629             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 630             DL, MVT::f32, SDValue(interp, 0));
 631       }
 632       MachineFunction &MF = DAG.getMachineFunction();
 633       MachineRegisterInfo &MRI = MF.getRegInfo();
 634       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 635       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 636       MRI.addLiveIn(RegisterI);
 637       MRI.addLiveIn(RegisterJ);
 638       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 639           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 640       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 641           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 642
 643       if (slot % 4 < 2)
 644         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 645             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 646             RegisterJNode, RegisterINode);
 647       else
 648         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 649             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 650             RegisterJNode, RegisterINode);
 651       return SDValue(interp, slot % 2);
 652     }
 653     case AMDGPUIntrinsic::R600_interp_xy:
 654     case AMDGPUIntrinsic::R600_interp_zw: {
 655       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 656       MachineSDNode *interp;
 657       SDValue RegisterINode = Op.getOperand(2);
 658       SDValue RegisterJNode = Op.getOperand(3);
 659
 660       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 661         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 662             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 663             RegisterJNode, RegisterINode);
 664       else
 665         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 666             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 667             RegisterJNode, RegisterINode);
 668       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 669           SDValue(interp, 0), SDValue(interp, 1));
 670     }
 671     case AMDGPUIntrinsic::R600_tex:
 672     case AMDGPUIntrinsic::R600_texc:
 673     case AMDGPUIntrinsic::R600_txl:
 674     case AMDGPUIntrinsic::R600_txlc:
 675     case AMDGPUIntrinsic::R600_txb:
 676     case AMDGPUIntrinsic::R600_txbc:
 677     case AMDGPUIntrinsic::R600_txf:
 678     case AMDGPUIntrinsic::R600_txq:
 679     case AMDGPUIntrinsic::R600_ddx:
 680     case AMDGPUIntrinsic::R600_ddy:
 681     case AMDGPUIntrinsic::R600_ldptr: {
 682       unsigned TextureOp;
 683       switch (IntrinsicID) {
 684       case AMDGPUIntrinsic::R600_tex:
 685         TextureOp = 0;
 686         break;
 687       case AMDGPUIntrinsic::R600_texc:
 688         TextureOp = 1;
 689         break;
 690       case AMDGPUIntrinsic::R600_txl:
 691         TextureOp = 2;
 692         break;
 693       case AMDGPUIntrinsic::R600_txlc:
 694         TextureOp = 3;
 695         break;
 696       case AMDGPUIntrinsic::R600_txb:
 697         TextureOp = 4;
 698         break;
 699       case AMDGPUIntrinsic::R600_txbc:
 700         TextureOp = 5;
 701         break;
 702       case AMDGPUIntrinsic::R600_txf:
 703         TextureOp = 6;
 704         break;
 705       case AMDGPUIntrinsic::R600_txq:
 706         TextureOp = 7;
 707         break;
 708       case AMDGPUIntrinsic::R600_ddx:
 709         TextureOp = 8;
 710         break;
 711       case AMDGPUIntrinsic::R600_ddy:
 712         TextureOp = 9;
 713         break;
 714       case AMDGPUIntrinsic::R600_ldptr:
 715         TextureOp = 10;
 716         break;
 717       default:
 718         llvm_unreachable("Unknow Texture Operation");
 719       }
 720
 721       SDValue TexArgs[19] = {
 722         DAG.getConstant(TextureOp, MVT::i32),
 723         Op.getOperand(1),
 724         DAG.getConstant(0, MVT::i32),
 725         DAG.getConstant(1, MVT::i32),
 726         DAG.getConstant(2, MVT::i32),
 727         DAG.getConstant(3, MVT::i32),
 728         Op.getOperand(2),
 729         Op.getOperand(3),
 730         Op.getOperand(4),
 731         DAG.getConstant(0, MVT::i32),
 732         DAG.getConstant(1, MVT::i32),
 733         DAG.getConstant(2, MVT::i32),
 734         DAG.getConstant(3, MVT::i32),
 735         Op.getOperand(5),
 736         Op.getOperand(6),
 737         Op.getOperand(7),
 738         Op.getOperand(8),
 739         Op.getOperand(9),
 740         Op.getOperand(10)
 741       };
 742       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 743     }
 744     case AMDGPUIntrinsic::AMDGPU_dp4: {
 745       SDValue Args[8] = {
 746       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 747           DAG.getConstant(0, MVT::i32)),
 748       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 749           DAG.getConstant(0, MVT::i32)),
 750       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 751           DAG.getConstant(1, MVT::i32)),
 752       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 753           DAG.getConstant(1, MVT::i32)),
 754       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 755           DAG.getConstant(2, MVT::i32)),
 756       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 757           DAG.getConstant(2, MVT::i32)),
 758       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 759           DAG.getConstant(3, MVT::i32)),
 760       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 761           DAG.getConstant(3, MVT::i32))
 762       };
 763       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 764     }
 765
 766     case Intrinsic::r600_read_ngroups_x:
 767       return LowerImplicitParameter(DAG, VT, DL, 0);
 768     case Intrinsic::r600_read_ngroups_y:
 769       return LowerImplicitParameter(DAG, VT, DL, 1);
 770     case Intrinsic::r600_read_ngroups_z:
 771       return LowerImplicitParameter(DAG, VT, DL, 2);
 772     case Intrinsic::r600_read_global_size_x:
 773       return LowerImplicitParameter(DAG, VT, DL, 3);
 774     case Intrinsic::r600_read_global_size_y:
 775       return LowerImplicitParameter(DAG, VT, DL, 4);
 776     case Intrinsic::r600_read_global_size_z:
 777       return LowerImplicitParameter(DAG, VT, DL, 5);
 778     case Intrinsic::r600_read_local_size_x:
 779       return LowerImplicitParameter(DAG, VT, DL, 6);
 780     case Intrinsic::r600_read_local_size_y:
 781       return LowerImplicitParameter(DAG, VT, DL, 7);
 782     case Intrinsic::r600_read_local_size_z:
 783       return LowerImplicitParameter(DAG, VT, DL, 8);
 784
 785     case Intrinsic::r600_read_tgid_x:
 786       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 787                                   AMDGPU::T1_X, VT);
 788     case Intrinsic::r600_read_tgid_y:
 789       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 790                                   AMDGPU::T1_Y, VT);
 791     case Intrinsic::r600_read_tgid_z:
 792       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 793                                   AMDGPU::T1_Z, VT);
 794     case Intrinsic::r600_read_tidig_x:
 795       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 796                                   AMDGPU::T0_X, VT);
 797     case Intrinsic::r600_read_tidig_y:
 798       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 799                                   AMDGPU::T0_Y, VT);
 800     case Intrinsic::r600_read_tidig_z:
 801       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 802                                   AMDGPU::T0_Z, VT);
 803     }
 804     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 805     break;
 806   }
 807   } // end switch(Op.getOpcode())
 808   return SDValue();
 809 }
 810
 811 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 812                                             SmallVectorImpl<SDValue> &Results,
 813                                             SelectionDAG &DAG) const {
 814   switch (N->getOpcode()) {
 815   default:
 816     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 817     return;
 818   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 819     return;
 820   case ISD::LOAD: {
 821     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 822     Results.push_back(SDValue(Node, 0));
 823     Results.push_back(SDValue(Node, 1));
 824     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 825     // function
 826     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 827     return;
 828   }
 829   case ISD::STORE:
 830     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 831     Results.push_back(SDValue(Node, 0));
 832     return;
 833   }
 834 }
 835
 836 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 837                                                    SDValue Vector) const {
 838
 839   SDLoc DL(Vector);
 840   EVT VecVT = Vector.getValueType();
 841   EVT EltVT = VecVT.getVectorElementType();
 842   SmallVector<SDValue, 8> Args;
 843
 844   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 845                                                            i != e; ++i) {
 846     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 847                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 848   }
 849
 850   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 851 }
 852
 853 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 854                                                     SelectionDAG &DAG) const {
 855
 856   SDLoc DL(Op);
 857   SDValue Vector = Op.getOperand(0);
 858   SDValue Index = Op.getOperand(1);
 859
 860   if (isa<ConstantSDNode>(Index) ||
 861       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 862     return Op;
 863
 864   Vector = vectorToVerticalVector(DAG, Vector);
 865   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 866                      Vector, Index);
 867 }
 868
 869 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 870                                                    SelectionDAG &DAG) const {
 871   SDLoc DL(Op);
 872   SDValue Vector = Op.getOperand(0);
 873   SDValue Value = Op.getOperand(1);
 874   SDValue Index = Op.getOperand(2);
 875
 876   if (isa<ConstantSDNode>(Index) ||
 877       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 878     return Op;
 879
 880   Vector = vectorToVerticalVector(DAG, Vector);
 881   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 882                                Vector, Value, Index);
 883   return vectorToVerticalVector(DAG, Insert);
 884 }
 885
 886 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 887   // On hw >= R700, COS/SIN input must be between -1. and 1.
 888   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 889   EVT VT = Op.getValueType();
 890   SDValue Arg = Op.getOperand(0);
 891   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 892       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 893         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 894           DAG.getConstantFP(0.15915494309, MVT::f32)),
 895         DAG.getConstantFP(0.5, MVT::f32)));
 896   unsigned TrigNode;
 897   switch (Op.getOpcode()) {
 898   case ISD::FCOS:
 899     TrigNode = AMDGPUISD::COS_HW;
 900     break;
 901   case ISD::FSIN:
 902     TrigNode = AMDGPUISD::SIN_HW;
 903     break;
 904   default:
 905     llvm_unreachable("Wrong trig opcode");
 906   }
 907   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 908       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 909         DAG.getConstantFP(-0.5, MVT::f32)));
 910   if (Gen >= AMDGPUSubtarget::R700)
 911     return TrigVal;
 912   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 913   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 914       DAG.getConstantFP(3.14159265359, MVT::f32));
 915 }
 916
 917 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
 918   SDLoc DL(Op);
 919   EVT VT = Op.getValueType();
 920
 921   SDValue Lo = Op.getOperand(0);
 922   SDValue Hi = Op.getOperand(1);
 923   SDValue Shift = Op.getOperand(2);
 924   SDValue Zero = DAG.getConstant(0, VT);
 925   SDValue One  = DAG.getConstant(1, VT);
 926
 927   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
 928   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
 929   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 930   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 931
 932   // The dance around Width1 is necessary for 0 special case.
 933   // Without it the CompShift might be 32, producing incorrect results in
 934   // Overflow. So we do the shift in two steps, the alternative is to
 935   // add a conditional to filter the special case.
 936
 937   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
 938   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
 939
 940   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
 941   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
 942   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
 943
 944   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
 945   SDValue LoBig = Zero;
 946
 947   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
 948   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
 949
 950   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
 951 }
 952
 953 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
 954   SDLoc DL(Op);
 955   EVT VT = Op.getValueType();
 956
 957   SDValue Lo = Op.getOperand(0);
 958   SDValue Hi = Op.getOperand(1);
 959   SDValue Shift = Op.getOperand(2);
 960   SDValue Zero = DAG.getConstant(0, VT);
 961   SDValue One  = DAG.getConstant(1, VT);
 962
 963   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
 964
 965   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
 966   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
 967   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 968   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 969
 970   // The dance around Width1 is necessary for 0 special case.
 971   // Without it the CompShift might be 32, producing incorrect results in
 972   // Overflow. So we do the shift in two steps, the alternative is to
 973   // add a conditional to filter the special case.
 974
 975   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
 976   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
 977
 978   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
 979   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
 980   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
 981
 982   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
 983   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
 984
 985   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
 986   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
 987
 988   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
 989 }
 990
 991 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 992   return DAG.getNode(
 993       ISD::SETCC,
 994       SDLoc(Op),
 995       MVT::i1,
 996       Op, DAG.getConstantFP(0.0f, MVT::f32),
 997       DAG.getCondCode(ISD::SETNE)
 998       );
 999 }
1000
1001 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1002                                                    SDLoc DL,
1003                                                    unsigned DwordOffset) const {
1004   unsigned ByteOffset = DwordOffset * 4;
1005   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1006                                       AMDGPUAS::CONSTANT_BUFFER_0);
1007
1008   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1009   assert(isInt<16>(ByteOffset));
1010
1011   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1012                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1013                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1014                      false, false, false, 0);
1015 }
1016
1017 bool R600TargetLowering::isZero(SDValue Op) const {
1018   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1019     return Cst->isNullValue();
1020   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1021     return CstFP->isZero();
1022   } else {
1023     return false;
1024   }
1025 }
1026
1027 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1028   SDLoc DL(Op);
1029   EVT VT = Op.getValueType();
1030
1031   SDValue LHS = Op.getOperand(0);
1032   SDValue RHS = Op.getOperand(1);
1033   SDValue True = Op.getOperand(2);
1034   SDValue False = Op.getOperand(3);
1035   SDValue CC = Op.getOperand(4);
1036   SDValue Temp;
1037
1038   // LHS and RHS are guaranteed to be the same value type
1039   EVT CompareVT = LHS.getValueType();
1040
1041   // Check if we can lower this to a native operation.
1042
1043   // Try to lower to a SET* instruction:
1044   //
1045   // SET* can match the following patterns:
1046   //
1047   // select_cc f32, f32, -1,  0, cc_supported
1048   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1049   // select_cc i32, i32, -1,  0, cc_supported
1050   //
1051
1052   // Move hardware True/False values to the correct operand.
1053   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1054   ISD::CondCode InverseCC =
1055      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1056   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1057     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1058       std::swap(False, True);
1059       CC = DAG.getCondCode(InverseCC);
1060     } else {
1061       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1062       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1063         std::swap(False, True);
1064         std::swap(LHS, RHS);
1065         CC = DAG.getCondCode(SwapInvCC);
1066       }
1067     }
1068   }
1069
1070   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1071       (CompareVT == VT || VT == MVT::i32)) {
1072     // This can be matched by a SET* instruction.
1073     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1074   }
1075
1076   // Try to lower to a CND* instruction:
1077   //
1078   // CND* can match the following patterns:
1079   //
1080   // select_cc f32, 0.0, f32, f32, cc_supported
1081   // select_cc f32, 0.0, i32, i32, cc_supported
1082   // select_cc i32, 0,   f32, f32, cc_supported
1083   // select_cc i32, 0,   i32, i32, cc_supported
1084   //
1085
1086   // Try to move the zero value to the RHS
1087   if (isZero(LHS)) {
1088     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1089     // Try swapping the operands
1090     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1091     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1092       std::swap(LHS, RHS);
1093       CC = DAG.getCondCode(CCSwapped);
1094     } else {
1095       // Try inverting the conditon and then swapping the operands
1096       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1097       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1098       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1099         std::swap(True, False);
1100         std::swap(LHS, RHS);
1101         CC = DAG.getCondCode(CCSwapped);
1102       }
1103     }
1104   }
1105   if (isZero(RHS)) {
1106     SDValue Cond = LHS;
1107     SDValue Zero = RHS;
1108     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1109     if (CompareVT != VT) {
1110       // Bitcast True / False to the correct types.  This will end up being
1111       // a nop, but it allows us to define only a single pattern in the
1112       // .TD files for each CND* instruction rather than having to have
1113       // one pattern for integer True/False and one for fp True/False
1114       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1115       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1116     }
1117
1118     switch (CCOpcode) {
1119     case ISD::SETONE:
1120     case ISD::SETUNE:
1121     case ISD::SETNE:
1122       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1123       Temp = True;
1124       True = False;
1125       False = Temp;
1126       break;
1127     default:
1128       break;
1129     }
1130     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1131         Cond, Zero,
1132         True, False,
1133         DAG.getCondCode(CCOpcode));
1134     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1135   }
1136
1137   // If we make it this for it means we have no native instructions to handle
1138   // this SELECT_CC, so we must lower it.
1139   SDValue HWTrue, HWFalse;
1140
1141   if (CompareVT == MVT::f32) {
1142     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1143     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1144   } else if (CompareVT == MVT::i32) {
1145     HWTrue = DAG.getConstant(-1, CompareVT);
1146     HWFalse = DAG.getConstant(0, CompareVT);
1147   }
1148   else {
1149     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1150   }
1151
1152   // Lower this unsupported SELECT_CC into a combination of two supported
1153   // SELECT_CC operations.
1154   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1155
1156   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1157       Cond, HWFalse,
1158       True, False,
1159       DAG.getCondCode(ISD::SETNE));
1160 }
1161
1162 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1163 /// convert these pointers to a register index.  Each register holds
1164 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1165 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1166 /// for indirect addressing.
1167 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1168                                                unsigned StackWidth,
1169                                                SelectionDAG &DAG) const {
1170   unsigned SRLPad;
1171   switch(StackWidth) {
1172   case 1:
1173     SRLPad = 2;
1174     break;
1175   case 2:
1176     SRLPad = 3;
1177     break;
1178   case 4:
1179     SRLPad = 4;
1180     break;
1181   default: llvm_unreachable("Invalid stack width");
1182   }
1183
1184   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1185                      DAG.getConstant(SRLPad, MVT::i32));
1186 }
1187
1188 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1189                                          unsigned ElemIdx,
1190                                          unsigned &Channel,
1191                                          unsigned &PtrIncr) const {
1192   switch (StackWidth) {
1193   default:
1194   case 1:
1195     Channel = 0;
1196     if (ElemIdx > 0) {
1197       PtrIncr = 1;
1198     } else {
1199       PtrIncr = 0;
1200     }
1201     break;
1202   case 2:
1203     Channel = ElemIdx % 2;
1204     if (ElemIdx == 2) {
1205       PtrIncr = 1;
1206     } else {
1207       PtrIncr = 0;
1208     }
1209     break;
1210   case 4:
1211     Channel = ElemIdx;
1212     PtrIncr = 0;
1213     break;
1214   }
1215 }
1216
1217 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1218   SDLoc DL(Op);
1219   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1220   SDValue Chain = Op.getOperand(0);
1221   SDValue Value = Op.getOperand(1);
1222   SDValue Ptr = Op.getOperand(2);
1223
1224   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1225   if (Result.getNode()) {
1226     return Result;
1227   }
1228
1229   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1230     if (StoreNode->isTruncatingStore()) {
1231       EVT VT = Value.getValueType();
1232       assert(VT.bitsLE(MVT::i32));
1233       EVT MemVT = StoreNode->getMemoryVT();
1234       SDValue MaskConstant;
1235       if (MemVT == MVT::i8) {
1236         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1237       } else {
1238         assert(MemVT == MVT::i16);
1239         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1240       }
1241       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1242                                       DAG.getConstant(2, MVT::i32));
1243       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1244                                       DAG.getConstant(0x00000003, VT));
1245       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1246       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1247                                    DAG.getConstant(3, VT));
1248       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1249       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1250       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1251       // vector instead.
1252       SDValue Src[4] = {
1253         ShiftedValue,
1254         DAG.getConstant(0, MVT::i32),
1255         DAG.getConstant(0, MVT::i32),
1256         Mask
1257       };
1258       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1259       SDValue Args[3] = { Chain, Input, DWordAddr };
1260       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1261                                      Op->getVTList(), Args, MemVT,
1262                                      StoreNode->getMemOperand());
1263     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1264                Value.getValueType().bitsGE(MVT::i32)) {
1265       // Convert pointer from byte address to dword address.
1266       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1267                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1268                                     Ptr, DAG.getConstant(2, MVT::i32)));
1269
1270       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1271         llvm_unreachable("Truncated and indexed stores not supported yet");
1272       } else {
1273         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1274       }
1275       return Chain;
1276     }
1277   }
1278
1279   EVT ValueVT = Value.getValueType();
1280
1281   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1282     return SDValue();
1283   }
1284
1285   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1286   if (Ret.getNode()) {
1287     return Ret;
1288   }
1289   // Lowering for indirect addressing
1290
1291   const MachineFunction &MF = DAG.getMachineFunction();
1292   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1293                                          getTargetMachine().getFrameLowering());
1294   unsigned StackWidth = TFL->getStackWidth(MF);
1295
1296   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1297
1298   if (ValueVT.isVector()) {
1299     unsigned NumElemVT = ValueVT.getVectorNumElements();
1300     EVT ElemVT = ValueVT.getVectorElementType();
1301     SmallVector<SDValue, 4> Stores(NumElemVT);
1302
1303     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1304                                       "vector width in load");
1305
1306     for (unsigned i = 0; i < NumElemVT; ++i) {
1307       unsigned Channel, PtrIncr;
1308       getStackAddress(StackWidth, i, Channel, PtrIncr);
1309       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1310                         DAG.getConstant(PtrIncr, MVT::i32));
1311       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1312                                  Value, DAG.getConstant(i, MVT::i32));
1313
1314       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1315                               Chain, Elem, Ptr,
1316                               DAG.getTargetConstant(Channel, MVT::i32));
1317     }
1318      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1319    } else {
1320     if (ValueVT == MVT::i8) {
1321       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1322     }
1323     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1324     DAG.getTargetConstant(0, MVT::i32)); // Channel
1325   }
1326
1327   return Chain;
1328 }
1329
1330 // return (512 + (kc_bank << 12)
1331 static int
1332 ConstantAddressBlock(unsigned AddressSpace) {
1333   switch (AddressSpace) {
1334   case AMDGPUAS::CONSTANT_BUFFER_0:
1335     return 512;
1336   case AMDGPUAS::CONSTANT_BUFFER_1:
1337     return 512 + 4096;
1338   case AMDGPUAS::CONSTANT_BUFFER_2:
1339     return 512 + 4096 * 2;
1340   case AMDGPUAS::CONSTANT_BUFFER_3:
1341     return 512 + 4096 * 3;
1342   case AMDGPUAS::CONSTANT_BUFFER_4:
1343     return 512 + 4096 * 4;
1344   case AMDGPUAS::CONSTANT_BUFFER_5:
1345     return 512 + 4096 * 5;
1346   case AMDGPUAS::CONSTANT_BUFFER_6:
1347     return 512 + 4096 * 6;
1348   case AMDGPUAS::CONSTANT_BUFFER_7:
1349     return 512 + 4096 * 7;
1350   case AMDGPUAS::CONSTANT_BUFFER_8:
1351     return 512 + 4096 * 8;
1352   case AMDGPUAS::CONSTANT_BUFFER_9:
1353     return 512 + 4096 * 9;
1354   case AMDGPUAS::CONSTANT_BUFFER_10:
1355     return 512 + 4096 * 10;
1356   case AMDGPUAS::CONSTANT_BUFFER_11:
1357     return 512 + 4096 * 11;
1358   case AMDGPUAS::CONSTANT_BUFFER_12:
1359     return 512 + 4096 * 12;
1360   case AMDGPUAS::CONSTANT_BUFFER_13:
1361     return 512 + 4096 * 13;
1362   case AMDGPUAS::CONSTANT_BUFFER_14:
1363     return 512 + 4096 * 14;
1364   case AMDGPUAS::CONSTANT_BUFFER_15:
1365     return 512 + 4096 * 15;
1366   default:
1367     return -1;
1368   }
1369 }
1370
1371 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1372 {
1373   EVT VT = Op.getValueType();
1374   SDLoc DL(Op);
1375   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1376   SDValue Chain = Op.getOperand(0);
1377   SDValue Ptr = Op.getOperand(1);
1378   SDValue LoweredLoad;
1379
1380   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1381   if (Ret.getNode()) {
1382     SDValue Ops[2] = {
1383       Ret,
1384       Chain
1385     };
1386     return DAG.getMergeValues(Ops, DL);
1387   }
1388
1389
1390   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1391     SDValue MergedValues[2] = {
1392       SplitVectorLoad(Op, DAG),
1393       Chain
1394     };
1395     return DAG.getMergeValues(MergedValues, DL);
1396   }
1397
1398   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1399   if (ConstantBlock > -1 &&
1400       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1401        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1402     SDValue Result;
1403     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1404         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1405         isa<ConstantSDNode>(Ptr)) {
1406       SDValue Slots[4];
1407       for (unsigned i = 0; i < 4; i++) {
1408         // We want Const position encoded with the following formula :
1409         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1410         // const_index is Ptr computed by llvm using an alignment of 16.
1411         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1412         // then div by 4 at the ISel step
1413         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1414             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1415         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1416       }
1417       EVT NewVT = MVT::v4i32;
1418       unsigned NumElements = 4;
1419       if (VT.isVector()) {
1420         NewVT = VT;
1421         NumElements = VT.getVectorNumElements();
1422       }
1423       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1424                            makeArrayRef(Slots, NumElements));
1425     } else {
1426       // non-constant ptr can't be folded, keeps it as a v4f32 load
1427       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1428           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1429           DAG.getConstant(LoadNode->getAddressSpace() -
1430                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1431           );
1432     }
1433
1434     if (!VT.isVector()) {
1435       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1436           DAG.getConstant(0, MVT::i32));
1437     }
1438
1439     SDValue MergedValues[2] = {
1440       Result,
1441       Chain
1442     };
1443     return DAG.getMergeValues(MergedValues, DL);
1444   }
1445
1446   // For most operations returning SDValue() will result in the node being
1447   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1448   // need to manually expand loads that may be legal in some address spaces and
1449   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1450   // compute shaders, since the data is sign extended when it is uploaded to the
1451   // buffer. However SEXT loads from other address spaces are not supported, so
1452   // we need to expand them here.
1453   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1454     EVT MemVT = LoadNode->getMemoryVT();
1455     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1456     SDValue ShiftAmount =
1457           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1458     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1459                                   LoadNode->getPointerInfo(), MemVT,
1460                                   LoadNode->isVolatile(),
1461                                   LoadNode->isNonTemporal(),
1462                                   LoadNode->getAlignment());
1463     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1464     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1465
1466     SDValue MergedValues[2] = { Sra, Chain };
1467     return DAG.getMergeValues(MergedValues, DL);
1468   }
1469
1470   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1471     return SDValue();
1472   }
1473
1474   // Lowering for indirect addressing
1475   const MachineFunction &MF = DAG.getMachineFunction();
1476   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1477                                          getTargetMachine().getFrameLowering());
1478   unsigned StackWidth = TFL->getStackWidth(MF);
1479
1480   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1481
1482   if (VT.isVector()) {
1483     unsigned NumElemVT = VT.getVectorNumElements();
1484     EVT ElemVT = VT.getVectorElementType();
1485     SDValue Loads[4];
1486
1487     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1488                                       "vector width in load");
1489
1490     for (unsigned i = 0; i < NumElemVT; ++i) {
1491       unsigned Channel, PtrIncr;
1492       getStackAddress(StackWidth, i, Channel, PtrIncr);
1493       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1494                         DAG.getConstant(PtrIncr, MVT::i32));
1495       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1496                              Chain, Ptr,
1497                              DAG.getTargetConstant(Channel, MVT::i32),
1498                              Op.getOperand(2));
1499     }
1500     for (unsigned i = NumElemVT; i < 4; ++i) {
1501       Loads[i] = DAG.getUNDEF(ElemVT);
1502     }
1503     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1504     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1505   } else {
1506     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1507                               Chain, Ptr,
1508                               DAG.getTargetConstant(0, MVT::i32), // Channel
1509                               Op.getOperand(2));
1510   }
1511
1512   SDValue Ops[2] = {
1513     LoweredLoad,
1514     Chain
1515   };
1516
1517   return DAG.getMergeValues(Ops, DL);
1518 }
1519
1520 /// XXX Only kernel functions are supported, so we can assume for now that
1521 /// every function is a kernel function, but in the future we should use
1522 /// separate calling conventions for kernel and non-kernel functions.
1523 SDValue R600TargetLowering::LowerFormalArguments(
1524                                       SDValue Chain,
1525                                       CallingConv::ID CallConv,
1526                                       bool isVarArg,
1527                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1528                                       SDLoc DL, SelectionDAG &DAG,
1529                                       SmallVectorImpl<SDValue> &InVals) const {
1530   SmallVector<CCValAssign, 16> ArgLocs;
1531   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1532                  getTargetMachine(), ArgLocs, *DAG.getContext());
1533   MachineFunction &MF = DAG.getMachineFunction();
1534   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1535
1536   SmallVector<ISD::InputArg, 8> LocalIns;
1537
1538   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1539
1540   AnalyzeFormalArguments(CCInfo, LocalIns);
1541
1542   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1543     CCValAssign &VA = ArgLocs[i];
1544     EVT VT = Ins[i].VT;
1545     EVT MemVT = LocalIns[i].VT;
1546
1547     if (ShaderType != ShaderType::COMPUTE) {
1548       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1549       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1550       InVals.push_back(Register);
1551       continue;
1552     }
1553
1554     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1555                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1556
1557     // i64 isn't a legal type, so the register type used ends up as i32, which
1558     // isn't expected here. It attempts to create this sextload, but it ends up
1559     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1560     // for <1 x i64>.
1561
1562     // The first 36 bytes of the input buffer contains information about
1563     // thread group and global sizes.
1564
1565     // FIXME: This should really check the extload type, but the handling of
1566     // extload vecto parameters seems to be broken.
1567     //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1568     ISD::LoadExtType Ext = ISD::SEXTLOAD;
1569     SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
1570                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1571                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1572                                  MemVT, false, false, 4);
1573
1574     // 4 is the preferred alignment for the CONSTANT memory space.
1575     InVals.push_back(Arg);
1576   }
1577   return Chain;
1578 }
1579
1580 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1581    if (!VT.isVector())
1582      return MVT::i32;
1583    return VT.changeVectorElementTypeToInteger();
1584 }
1585
1586 static SDValue CompactSwizzlableVector(
1587   SelectionDAG &DAG, SDValue VectorEntry,
1588   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1589   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1590   assert(RemapSwizzle.empty());
1591   SDValue NewBldVec[4] = {
1592     VectorEntry.getOperand(0),
1593     VectorEntry.getOperand(1),
1594     VectorEntry.getOperand(2),
1595     VectorEntry.getOperand(3)
1596   };
1597
1598   for (unsigned i = 0; i < 4; i++) {
1599     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1600       // We mask write here to teach later passes that the ith element of this
1601       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1602       // break false dependencies and additionnaly make assembly easier to read.
1603       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1604     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1605       if (C->isZero()) {
1606         RemapSwizzle[i] = 4; // SEL_0
1607         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1608       } else if (C->isExactlyValue(1.0)) {
1609         RemapSwizzle[i] = 5; // SEL_1
1610         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1611       }
1612     }
1613
1614     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1615       continue;
1616     for (unsigned j = 0; j < i; j++) {
1617       if (NewBldVec[i] == NewBldVec[j]) {
1618         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1619         RemapSwizzle[i] = j;
1620         break;
1621       }
1622     }
1623   }
1624
1625   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1626                      VectorEntry.getValueType(), NewBldVec);
1627 }
1628
1629 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1630                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1631   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1632   assert(RemapSwizzle.empty());
1633   SDValue NewBldVec[4] = {
1634       VectorEntry.getOperand(0),
1635       VectorEntry.getOperand(1),
1636       VectorEntry.getOperand(2),
1637       VectorEntry.getOperand(3)
1638   };
1639   bool isUnmovable[4] = { false, false, false, false };
1640   for (unsigned i = 0; i < 4; i++) {
1641     RemapSwizzle[i] = i;
1642     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1643       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1644           ->getZExtValue();
1645       if (i == Idx)
1646         isUnmovable[Idx] = true;
1647     }
1648   }
1649
1650   for (unsigned i = 0; i < 4; i++) {
1651     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1652       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1653           ->getZExtValue();
1654       if (isUnmovable[Idx])
1655         continue;
1656       // Swap i and Idx
1657       std::swap(NewBldVec[Idx], NewBldVec[i]);
1658       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1659       break;
1660     }
1661   }
1662
1663   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1664                      VectorEntry.getValueType(), NewBldVec);
1665 }
1666
1667
1668 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1669 SDValue Swz[4], SelectionDAG &DAG) const {
1670   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1671   // Old -> New swizzle values
1672   DenseMap<unsigned, unsigned> SwizzleRemap;
1673
1674   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1675   for (unsigned i = 0; i < 4; i++) {
1676     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1677     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1678       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1679   }
1680
1681   SwizzleRemap.clear();
1682   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1683   for (unsigned i = 0; i < 4; i++) {
1684     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1685     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1686       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1687   }
1688
1689   return BuildVector;
1690 }
1691
1692
1693 //===----------------------------------------------------------------------===//
1694 // Custom DAG Optimizations
1695 //===----------------------------------------------------------------------===//
1696
1697 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1698                                               DAGCombinerInfo &DCI) const {
1699   SelectionDAG &DAG = DCI.DAG;
1700
1701   switch (N->getOpcode()) {
1702   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1703   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1704   case ISD::FP_ROUND: {
1705       SDValue Arg = N->getOperand(0);
1706       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1707         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1708                            Arg.getOperand(0));
1709       }
1710       break;
1711     }
1712
1713   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1714   // (i32 select_cc f32, f32, -1, 0 cc)
1715   //
1716   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1717   // this to one of the SET*_DX10 instructions.
1718   case ISD::FP_TO_SINT: {
1719     SDValue FNeg = N->getOperand(0);
1720     if (FNeg.getOpcode() != ISD::FNEG) {
1721       return SDValue();
1722     }
1723     SDValue SelectCC = FNeg.getOperand(0);
1724     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1725         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1726         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1727         !isHWTrueValue(SelectCC.getOperand(2)) ||
1728         !isHWFalseValue(SelectCC.getOperand(3))) {
1729       return SDValue();
1730     }
1731
1732     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1733                            SelectCC.getOperand(0), // LHS
1734                            SelectCC.getOperand(1), // RHS
1735                            DAG.getConstant(-1, MVT::i32), // True
1736                            DAG.getConstant(0, MVT::i32),  // Flase
1737                            SelectCC.getOperand(4)); // CC
1738
1739     break;
1740   }
1741
1742   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1743   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1744   case ISD::INSERT_VECTOR_ELT: {
1745     SDValue InVec = N->getOperand(0);
1746     SDValue InVal = N->getOperand(1);
1747     SDValue EltNo = N->getOperand(2);
1748     SDLoc dl(N);
1749
1750     // If the inserted element is an UNDEF, just use the input vector.
1751     if (InVal.getOpcode() == ISD::UNDEF)
1752       return InVec;
1753
1754     EVT VT = InVec.getValueType();
1755
1756     // If we can't generate a legal BUILD_VECTOR, exit
1757     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1758       return SDValue();
1759
1760     // Check that we know which element is being inserted
1761     if (!isa<ConstantSDNode>(EltNo))
1762       return SDValue();
1763     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1764
1765     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1766     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1767     // vector elements.
1768     SmallVector<SDValue, 8> Ops;
1769     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1770       Ops.append(InVec.getNode()->op_begin(),
1771                  InVec.getNode()->op_end());
1772     } else if (InVec.getOpcode() == ISD::UNDEF) {
1773       unsigned NElts = VT.getVectorNumElements();
1774       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1775     } else {
1776       return SDValue();
1777     }
1778
1779     // Insert the element
1780     if (Elt < Ops.size()) {
1781       // All the operands of BUILD_VECTOR must have the same type;
1782       // we enforce that here.
1783       EVT OpVT = Ops[0].getValueType();
1784       if (InVal.getValueType() != OpVT)
1785         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1786           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1787           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1788       Ops[Elt] = InVal;
1789     }
1790
1791     // Return the new vector
1792     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1793   }
1794
1795   // Extract_vec (Build_vector) generated by custom lowering
1796   // also needs to be customly combined
1797   case ISD::EXTRACT_VECTOR_ELT: {
1798     SDValue Arg = N->getOperand(0);
1799     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1800       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1801         unsigned Element = Const->getZExtValue();
1802         return Arg->getOperand(Element);
1803       }
1804     }
1805     if (Arg.getOpcode() == ISD::BITCAST &&
1806         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1807       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1808         unsigned Element = Const->getZExtValue();
1809         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1810             Arg->getOperand(0).getOperand(Element));
1811       }
1812     }
1813   }
1814
1815   case ISD::SELECT_CC: {
1816     // Try common optimizations
1817     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1818     if (Ret.getNode())
1819       return Ret;
1820
1821     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1822     //      selectcc x, y, a, b, inv(cc)
1823     //
1824     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1825     //      selectcc x, y, a, b, cc
1826     SDValue LHS = N->getOperand(0);
1827     if (LHS.getOpcode() != ISD::SELECT_CC) {
1828       return SDValue();
1829     }
1830
1831     SDValue RHS = N->getOperand(1);
1832     SDValue True = N->getOperand(2);
1833     SDValue False = N->getOperand(3);
1834     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1835
1836     if (LHS.getOperand(2).getNode() != True.getNode() ||
1837         LHS.getOperand(3).getNode() != False.getNode() ||
1838         RHS.getNode() != False.getNode()) {
1839       return SDValue();
1840     }
1841
1842     switch (NCC) {
1843     default: return SDValue();
1844     case ISD::SETNE: return LHS;
1845     case ISD::SETEQ: {
1846       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1847       LHSCC = ISD::getSetCCInverse(LHSCC,
1848                                   LHS.getOperand(0).getValueType().isInteger());
1849       if (DCI.isBeforeLegalizeOps() ||
1850           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1851         return DAG.getSelectCC(SDLoc(N),
1852                                LHS.getOperand(0),
1853                                LHS.getOperand(1),
1854                                LHS.getOperand(2),
1855                                LHS.getOperand(3),
1856                                LHSCC);
1857       break;
1858     }
1859     }
1860     return SDValue();
1861   }
1862
1863   case AMDGPUISD::EXPORT: {
1864     SDValue Arg = N->getOperand(1);
1865     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1866       break;
1867
1868     SDValue NewArgs[8] = {
1869       N->getOperand(0), // Chain
1870       SDValue(),
1871       N->getOperand(2), // ArrayBase
1872       N->getOperand(3), // Type
1873       N->getOperand(4), // SWZ_X
1874       N->getOperand(5), // SWZ_Y
1875       N->getOperand(6), // SWZ_Z
1876       N->getOperand(7) // SWZ_W
1877     };
1878     SDLoc DL(N);
1879     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1880     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
1881   }
1882   case AMDGPUISD::TEXTURE_FETCH: {
1883     SDValue Arg = N->getOperand(1);
1884     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1885       break;
1886
1887     SDValue NewArgs[19] = {
1888       N->getOperand(0),
1889       N->getOperand(1),
1890       N->getOperand(2),
1891       N->getOperand(3),
1892       N->getOperand(4),
1893       N->getOperand(5),
1894       N->getOperand(6),
1895       N->getOperand(7),
1896       N->getOperand(8),
1897       N->getOperand(9),
1898       N->getOperand(10),
1899       N->getOperand(11),
1900       N->getOperand(12),
1901       N->getOperand(13),
1902       N->getOperand(14),
1903       N->getOperand(15),
1904       N->getOperand(16),
1905       N->getOperand(17),
1906       N->getOperand(18),
1907     };
1908     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1909     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1910         NewArgs);
1911   }
1912   }
1913
1914   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1915 }
1916
1917 static bool
1918 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1919             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1920   const R600InstrInfo *TII =
1921       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1922   if (!Src.isMachineOpcode())
1923     return false;
1924   switch (Src.getMachineOpcode()) {
1925   case AMDGPU::FNEG_R600:
1926     if (!Neg.getNode())
1927       return false;
1928     Src = Src.getOperand(0);
1929     Neg = DAG.getTargetConstant(1, MVT::i32);
1930     return true;
1931   case AMDGPU::FABS_R600:
1932     if (!Abs.getNode())
1933       return false;
1934     Src = Src.getOperand(0);
1935     Abs = DAG.getTargetConstant(1, MVT::i32);
1936     return true;
1937   case AMDGPU::CONST_COPY: {
1938     unsigned Opcode = ParentNode->getMachineOpcode();
1939     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1940
1941     if (!Sel.getNode())
1942       return false;
1943
1944     SDValue CstOffset = Src.getOperand(0);
1945     if (ParentNode->getValueType(0).isVector())
1946       return false;
1947
1948     // Gather constants values
1949     int SrcIndices[] = {
1950       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1951       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1952       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1953       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1954       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1955       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1956       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1957       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1958       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1959       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1960       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1961     };
1962     std::vector<unsigned> Consts;
1963     for (int OtherSrcIdx : SrcIndices) {
1964       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1965       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1966         continue;
1967       if (HasDst) {
1968         OtherSrcIdx--;
1969         OtherSelIdx--;
1970       }
1971       if (RegisterSDNode *Reg =
1972           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1973         if (Reg->getReg() == AMDGPU::ALU_CONST) {
1974           ConstantSDNode *Cst
1975             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
1976           Consts.push_back(Cst->getZExtValue());
1977         }
1978       }
1979     }
1980
1981     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
1982     Consts.push_back(Cst->getZExtValue());
1983     if (!TII->fitsConstReadLimitations(Consts)) {
1984       return false;
1985     }
1986
1987     Sel = CstOffset;
1988     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1989     return true;
1990   }
1991   case AMDGPU::MOV_IMM_I32:
1992   case AMDGPU::MOV_IMM_F32: {
1993     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1994     uint64_t ImmValue = 0;
1995
1996
1997     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1998       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1999       float FloatValue = FPC->getValueAPF().convertToFloat();
2000       if (FloatValue == 0.0) {
2001         ImmReg = AMDGPU::ZERO;
2002       } else if (FloatValue == 0.5) {
2003         ImmReg = AMDGPU::HALF;
2004       } else if (FloatValue == 1.0) {
2005         ImmReg = AMDGPU::ONE;
2006       } else {
2007         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2008       }
2009     } else {
2010       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2011       uint64_t Value = C->getZExtValue();
2012       if (Value == 0) {
2013         ImmReg = AMDGPU::ZERO;
2014       } else if (Value == 1) {
2015         ImmReg = AMDGPU::ONE_INT;
2016       } else {
2017         ImmValue = Value;
2018       }
2019     }
2020
2021     // Check that we aren't already using an immediate.
2022     // XXX: It's possible for an instruction to have more than one
2023     // immediate operand, but this is not supported yet.
2024     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2025       if (!Imm.getNode())
2026         return false;
2027       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2028       assert(C);
2029       if (C->getZExtValue())
2030         return false;
2031       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2032     }
2033     Src = DAG.getRegister(ImmReg, MVT::i32);
2034     return true;
2035   }
2036   default:
2037     return false;
2038   }
2039 }
2040
2041
2042 /// \brief Fold the instructions after selecting them
2043 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2044                                             SelectionDAG &DAG) const {
2045   const R600InstrInfo *TII =
2046       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2047   if (!Node->isMachineOpcode())
2048     return Node;
2049   unsigned Opcode = Node->getMachineOpcode();
2050   SDValue FakeOp;
2051
2052   std::vector<SDValue> Ops;
2053   for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
2054               I != E; ++I)
2055           Ops.push_back(*I);
2056
2057   if (Opcode == AMDGPU::DOT_4) {
2058     int OperandIdx[] = {
2059       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2060       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2061       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2062       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2063       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2064       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2065       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2066       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2067         };
2068     int NegIdx[] = {
2069       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2070       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2071       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2072       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2073       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2074       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2075       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2076       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2077     };
2078     int AbsIdx[] = {
2079       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2080       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2081       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2082       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2083       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2084       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2085       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2086       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2087     };
2088     for (unsigned i = 0; i < 8; i++) {
2089       if (OperandIdx[i] < 0)
2090         return Node;
2091       SDValue &Src = Ops[OperandIdx[i] - 1];
2092       SDValue &Neg = Ops[NegIdx[i] - 1];
2093       SDValue &Abs = Ops[AbsIdx[i] - 1];
2094       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2095       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2096       if (HasDst)
2097         SelIdx--;
2098       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2099       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2100         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2101     }
2102   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2103     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2104       SDValue &Src = Ops[i];
2105       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2106         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2107     }
2108   } else if (Opcode == AMDGPU::CLAMP_R600) {
2109     SDValue Src = Node->getOperand(0);
2110     if (!Src.isMachineOpcode() ||
2111         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2112       return Node;
2113     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2114         AMDGPU::OpName::clamp);
2115     if (ClampIdx < 0)
2116       return Node;
2117     std::vector<SDValue> Ops;
2118     unsigned NumOp = Src.getNumOperands();
2119     for(unsigned i = 0; i < NumOp; ++i)
2120           Ops.push_back(Src.getOperand(i));
2121     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2122     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2123         Node->getVTList(), Ops);
2124   } else {
2125     if (!TII->hasInstrModifiers(Opcode))
2126       return Node;
2127     int OperandIdx[] = {
2128       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2129       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2130       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2131     };
2132     int NegIdx[] = {
2133       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2134       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2135       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2136     };
2137     int AbsIdx[] = {
2138       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2139       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2140       -1
2141     };
2142     for (unsigned i = 0; i < 3; i++) {
2143       if (OperandIdx[i] < 0)
2144         return Node;
2145       SDValue &Src = Ops[OperandIdx[i] - 1];
2146       SDValue &Neg = Ops[NegIdx[i] - 1];
2147       SDValue FakeAbs;
2148       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2149       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2150       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2151       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2152       if (HasDst) {
2153         SelIdx--;
2154         ImmIdx--;
2155       }
2156       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2157       SDValue &Imm = Ops[ImmIdx];
2158       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2159         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2160     }
2161   }
2162
2163   return Node;
2164 }