lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDILIntrinsicInfo.h"
  17 #include "AMDGPUFrameLowering.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/CodeGen/CallingConvLower.h"
  23 #include "llvm/CodeGen/MachineFrameInfo.h"
  24 #include "llvm/CodeGen/MachineInstrBuilder.h"
  25 #include "llvm/CodeGen/MachineRegisterInfo.h"
  26 #include "llvm/CodeGen/SelectionDAG.h"
  27 #include "llvm/IR/Argument.h"
  28 #include "llvm/IR/Function.h"
  29
  30 using namespace llvm;
  31
  32 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  33     AMDGPUTargetLowering(TM),
  34     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  35   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  37   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  38   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  39   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  40   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  41
  42   computeRegisterProperties();
  43
  44   // Set condition code actions
  45   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  46   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  57
  58   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  59   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  62
  63   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  64   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  65
  66   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  67   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  68
  69   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  70   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  71
  72   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  73
  74   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  75   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  76   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  77
  78   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  79   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  80
  81   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  82   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  83   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  84
  85   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  86   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  87   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  88   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  89
  90   // Expand sign extension of vectors
  91   if (!Subtarget->hasBFE())
  92     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  93
  94   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  95   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
  96
  97   if (!Subtarget->hasBFE())
  98     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 100   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 101
 102   if (!Subtarget->hasBFE())
 103     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 105   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 106
 107   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 112
 113
 114   // Legalize loads and stores to the private address space.
 115   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 116   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 117   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 118
 119   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 120   // spaces, so it is custom lowered to handle those where it isn't.
 121   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
 122   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
 123   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
 124   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 125   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 126   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 127
 128   setOperationAction(ISD::STORE, MVT::i8, Custom);
 129   setOperationAction(ISD::STORE, MVT::i32, Custom);
 130   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 131   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 132   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 133   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 134
 135   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 136   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 137   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 138
 139   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 140   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 141   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 142   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 143
 144   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 145   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 146   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 147   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 148
 149   setTargetDAGCombine(ISD::FP_ROUND);
 150   setTargetDAGCombine(ISD::FP_TO_SINT);
 151   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 152   setTargetDAGCombine(ISD::SELECT_CC);
 153   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 154
 155   // These should be replaced by UDVIREM, but it does not happen automatically
 156   // during Type Legalization
 157   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 158   setOperationAction(ISD::UREM, MVT::i64, Custom);
 159
 160   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 161   //  to be Legal/Custom in order to avoid library calls.
 162   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 163   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 164
 165   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 166
 167   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 168   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 169   setSchedulingPreference(Sched::Source);
 170 }
 171
 172 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 173     MachineInstr * MI, MachineBasicBlock * BB) const {
 174   MachineFunction * MF = BB->getParent();
 175   MachineRegisterInfo &MRI = MF->getRegInfo();
 176   MachineBasicBlock::iterator I = *MI;
 177   const R600InstrInfo *TII =
 178     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 179
 180   switch (MI->getOpcode()) {
 181   default:
 182     // Replace LDS_*_RET instruction that don't have any uses with the
 183     // equivalent LDS_*_NORET instruction.
 184     if (TII->isLDSRetInstr(MI->getOpcode())) {
 185       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 186       assert(DstIdx != -1);
 187       MachineInstrBuilder NewMI;
 188       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
 189         return BB;
 190
 191       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 192                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 193       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 194         NewMI.addOperand(MI->getOperand(i));
 195       }
 196     } else {
 197       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 198     }
 199     break;
 200   case AMDGPU::CLAMP_R600: {
 201     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 202                                                    AMDGPU::MOV,
 203                                                    MI->getOperand(0).getReg(),
 204                                                    MI->getOperand(1).getReg());
 205     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 206     break;
 207   }
 208
 209   case AMDGPU::FABS_R600: {
 210     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 211                                                     AMDGPU::MOV,
 212                                                     MI->getOperand(0).getReg(),
 213                                                     MI->getOperand(1).getReg());
 214     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 215     break;
 216   }
 217
 218   case AMDGPU::FNEG_R600: {
 219     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 220                                                     AMDGPU::MOV,
 221                                                     MI->getOperand(0).getReg(),
 222                                                     MI->getOperand(1).getReg());
 223     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 224     break;
 225   }
 226
 227   case AMDGPU::MASK_WRITE: {
 228     unsigned maskedRegister = MI->getOperand(0).getReg();
 229     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 230     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 231     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 232     break;
 233   }
 234
 235   case AMDGPU::MOV_IMM_F32:
 236     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 237                      MI->getOperand(1).getFPImm()->getValueAPF()
 238                          .bitcastToAPInt().getZExtValue());
 239     break;
 240   case AMDGPU::MOV_IMM_I32:
 241     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 242                      MI->getOperand(1).getImm());
 243     break;
 244   case AMDGPU::CONST_COPY: {
 245     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 246         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 247     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 248         MI->getOperand(1).getImm());
 249     break;
 250   }
 251
 252   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 253   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 254   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 255     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 256
 257     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 258             .addOperand(MI->getOperand(0))
 259             .addOperand(MI->getOperand(1))
 260             .addImm(EOP); // Set End of program bit
 261     break;
 262   }
 263
 264   case AMDGPU::TXD: {
 265     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 266     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 267     MachineOperand &RID = MI->getOperand(4);
 268     MachineOperand &SID = MI->getOperand(5);
 269     unsigned TextureId = MI->getOperand(6).getImm();
 270     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 271     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 272
 273     switch (TextureId) {
 274     case 5: // Rect
 275       CTX = CTY = 0;
 276       break;
 277     case 6: // Shadow1D
 278       SrcW = SrcZ;
 279       break;
 280     case 7: // Shadow2D
 281       SrcW = SrcZ;
 282       break;
 283     case 8: // ShadowRect
 284       CTX = CTY = 0;
 285       SrcW = SrcZ;
 286       break;
 287     case 9: // 1DArray
 288       SrcZ = SrcY;
 289       CTZ = 0;
 290       break;
 291     case 10: // 2DArray
 292       CTZ = 0;
 293       break;
 294     case 11: // Shadow1DArray
 295       SrcZ = SrcY;
 296       CTZ = 0;
 297       break;
 298     case 12: // Shadow2DArray
 299       CTZ = 0;
 300       break;
 301     }
 302     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 303             .addOperand(MI->getOperand(3))
 304             .addImm(SrcX)
 305             .addImm(SrcY)
 306             .addImm(SrcZ)
 307             .addImm(SrcW)
 308             .addImm(0)
 309             .addImm(0)
 310             .addImm(0)
 311             .addImm(0)
 312             .addImm(1)
 313             .addImm(2)
 314             .addImm(3)
 315             .addOperand(RID)
 316             .addOperand(SID)
 317             .addImm(CTX)
 318             .addImm(CTY)
 319             .addImm(CTZ)
 320             .addImm(CTW);
 321     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 322             .addOperand(MI->getOperand(2))
 323             .addImm(SrcX)
 324             .addImm(SrcY)
 325             .addImm(SrcZ)
 326             .addImm(SrcW)
 327             .addImm(0)
 328             .addImm(0)
 329             .addImm(0)
 330             .addImm(0)
 331             .addImm(1)
 332             .addImm(2)
 333             .addImm(3)
 334             .addOperand(RID)
 335             .addOperand(SID)
 336             .addImm(CTX)
 337             .addImm(CTY)
 338             .addImm(CTZ)
 339             .addImm(CTW);
 340     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 341             .addOperand(MI->getOperand(0))
 342             .addOperand(MI->getOperand(1))
 343             .addImm(SrcX)
 344             .addImm(SrcY)
 345             .addImm(SrcZ)
 346             .addImm(SrcW)
 347             .addImm(0)
 348             .addImm(0)
 349             .addImm(0)
 350             .addImm(0)
 351             .addImm(1)
 352             .addImm(2)
 353             .addImm(3)
 354             .addOperand(RID)
 355             .addOperand(SID)
 356             .addImm(CTX)
 357             .addImm(CTY)
 358             .addImm(CTZ)
 359             .addImm(CTW)
 360             .addReg(T0, RegState::Implicit)
 361             .addReg(T1, RegState::Implicit);
 362     break;
 363   }
 364
 365   case AMDGPU::TXD_SHADOW: {
 366     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 367     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 368     MachineOperand &RID = MI->getOperand(4);
 369     MachineOperand &SID = MI->getOperand(5);
 370     unsigned TextureId = MI->getOperand(6).getImm();
 371     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 372     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 373
 374     switch (TextureId) {
 375     case 5: // Rect
 376       CTX = CTY = 0;
 377       break;
 378     case 6: // Shadow1D
 379       SrcW = SrcZ;
 380       break;
 381     case 7: // Shadow2D
 382       SrcW = SrcZ;
 383       break;
 384     case 8: // ShadowRect
 385       CTX = CTY = 0;
 386       SrcW = SrcZ;
 387       break;
 388     case 9: // 1DArray
 389       SrcZ = SrcY;
 390       CTZ = 0;
 391       break;
 392     case 10: // 2DArray
 393       CTZ = 0;
 394       break;
 395     case 11: // Shadow1DArray
 396       SrcZ = SrcY;
 397       CTZ = 0;
 398       break;
 399     case 12: // Shadow2DArray
 400       CTZ = 0;
 401       break;
 402     }
 403
 404     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 405             .addOperand(MI->getOperand(3))
 406             .addImm(SrcX)
 407             .addImm(SrcY)
 408             .addImm(SrcZ)
 409             .addImm(SrcW)
 410             .addImm(0)
 411             .addImm(0)
 412             .addImm(0)
 413             .addImm(0)
 414             .addImm(1)
 415             .addImm(2)
 416             .addImm(3)
 417             .addOperand(RID)
 418             .addOperand(SID)
 419             .addImm(CTX)
 420             .addImm(CTY)
 421             .addImm(CTZ)
 422             .addImm(CTW);
 423     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 424             .addOperand(MI->getOperand(2))
 425             .addImm(SrcX)
 426             .addImm(SrcY)
 427             .addImm(SrcZ)
 428             .addImm(SrcW)
 429             .addImm(0)
 430             .addImm(0)
 431             .addImm(0)
 432             .addImm(0)
 433             .addImm(1)
 434             .addImm(2)
 435             .addImm(3)
 436             .addOperand(RID)
 437             .addOperand(SID)
 438             .addImm(CTX)
 439             .addImm(CTY)
 440             .addImm(CTZ)
 441             .addImm(CTW);
 442     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 443             .addOperand(MI->getOperand(0))
 444             .addOperand(MI->getOperand(1))
 445             .addImm(SrcX)
 446             .addImm(SrcY)
 447             .addImm(SrcZ)
 448             .addImm(SrcW)
 449             .addImm(0)
 450             .addImm(0)
 451             .addImm(0)
 452             .addImm(0)
 453             .addImm(1)
 454             .addImm(2)
 455             .addImm(3)
 456             .addOperand(RID)
 457             .addOperand(SID)
 458             .addImm(CTX)
 459             .addImm(CTY)
 460             .addImm(CTZ)
 461             .addImm(CTW)
 462             .addReg(T0, RegState::Implicit)
 463             .addReg(T1, RegState::Implicit);
 464     break;
 465   }
 466
 467   case AMDGPU::BRANCH:
 468       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 469               .addOperand(MI->getOperand(0));
 470       break;
 471
 472   case AMDGPU::BRANCH_COND_f32: {
 473     MachineInstr *NewMI =
 474       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 475               AMDGPU::PREDICATE_BIT)
 476               .addOperand(MI->getOperand(1))
 477               .addImm(OPCODE_IS_NOT_ZERO)
 478               .addImm(0); // Flags
 479     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 480     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 481             .addOperand(MI->getOperand(0))
 482             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 483     break;
 484   }
 485
 486   case AMDGPU::BRANCH_COND_i32: {
 487     MachineInstr *NewMI =
 488       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 489             AMDGPU::PREDICATE_BIT)
 490             .addOperand(MI->getOperand(1))
 491             .addImm(OPCODE_IS_NOT_ZERO_INT)
 492             .addImm(0); // Flags
 493     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 494     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 495            .addOperand(MI->getOperand(0))
 496             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 497     break;
 498   }
 499
 500   case AMDGPU::EG_ExportSwz:
 501   case AMDGPU::R600_ExportSwz: {
 502     // Instruction is left unmodified if its not the last one of its type
 503     bool isLastInstructionOfItsType = true;
 504     unsigned InstExportType = MI->getOperand(1).getImm();
 505     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 506          EndBlock = BB->end(); NextExportInst != EndBlock;
 507          NextExportInst = std::next(NextExportInst)) {
 508       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 509           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 510         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 511             .getImm();
 512         if (CurrentInstExportType == InstExportType) {
 513           isLastInstructionOfItsType = false;
 514           break;
 515         }
 516       }
 517     }
 518     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 519     if (!EOP && !isLastInstructionOfItsType)
 520       return BB;
 521     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 522     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 523             .addOperand(MI->getOperand(0))
 524             .addOperand(MI->getOperand(1))
 525             .addOperand(MI->getOperand(2))
 526             .addOperand(MI->getOperand(3))
 527             .addOperand(MI->getOperand(4))
 528             .addOperand(MI->getOperand(5))
 529             .addOperand(MI->getOperand(6))
 530             .addImm(CfInst)
 531             .addImm(EOP);
 532     break;
 533   }
 534   case AMDGPU::RETURN: {
 535     // RETURN instructions must have the live-out registers as implicit uses,
 536     // otherwise they appear dead.
 537     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 538     MachineInstrBuilder MIB(*MF, MI);
 539     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 540       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 541     return BB;
 542   }
 543   }
 544
 545   MI->eraseFromParent();
 546   return BB;
 547 }
 548
 549 //===----------------------------------------------------------------------===//
 550 // Custom DAG Lowering Operations
 551 //===----------------------------------------------------------------------===//
 552
 553 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 554   MachineFunction &MF = DAG.getMachineFunction();
 555   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 556   switch (Op.getOpcode()) {
 557   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 558   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 559   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 560   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 561   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 562   case ISD::FCOS:
 563   case ISD::FSIN: return LowerTrig(Op, DAG);
 564   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 565   case ISD::STORE: return LowerSTORE(Op, DAG);
 566   case ISD::LOAD: return LowerLOAD(Op, DAG);
 567   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 568   case ISD::INTRINSIC_VOID: {
 569     SDValue Chain = Op.getOperand(0);
 570     unsigned IntrinsicID =
 571                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 572     switch (IntrinsicID) {
 573     case AMDGPUIntrinsic::AMDGPU_store_output: {
 574       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 575       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 576       MFI->LiveOuts.push_back(Reg);
 577       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 578     }
 579     case AMDGPUIntrinsic::R600_store_swizzle: {
 580       const SDValue Args[8] = {
 581         Chain,
 582         Op.getOperand(2), // Export Value
 583         Op.getOperand(3), // ArrayBase
 584         Op.getOperand(4), // Type
 585         DAG.getConstant(0, MVT::i32), // SWZ_X
 586         DAG.getConstant(1, MVT::i32), // SWZ_Y
 587         DAG.getConstant(2, MVT::i32), // SWZ_Z
 588         DAG.getConstant(3, MVT::i32) // SWZ_W
 589       };
 590       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 591     }
 592
 593     // default for switch(IntrinsicID)
 594     default: break;
 595     }
 596     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 597     break;
 598   }
 599   case ISD::INTRINSIC_WO_CHAIN: {
 600     unsigned IntrinsicID =
 601                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 602     EVT VT = Op.getValueType();
 603     SDLoc DL(Op);
 604     switch(IntrinsicID) {
 605     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 606     case AMDGPUIntrinsic::R600_load_input: {
 607       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 608       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 609       MachineFunction &MF = DAG.getMachineFunction();
 610       MachineRegisterInfo &MRI = MF.getRegInfo();
 611       MRI.addLiveIn(Reg);
 612       return DAG.getCopyFromReg(DAG.getEntryNode(),
 613           SDLoc(DAG.getEntryNode()), Reg, VT);
 614     }
 615
 616     case AMDGPUIntrinsic::R600_interp_input: {
 617       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 618       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 619       MachineSDNode *interp;
 620       if (ijb < 0) {
 621         const MachineFunction &MF = DAG.getMachineFunction();
 622         const R600InstrInfo *TII =
 623           static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
 624         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 625             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 626         return DAG.getTargetExtractSubreg(
 627             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 628             DL, MVT::f32, SDValue(interp, 0));
 629       }
 630       MachineFunction &MF = DAG.getMachineFunction();
 631       MachineRegisterInfo &MRI = MF.getRegInfo();
 632       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 633       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 634       MRI.addLiveIn(RegisterI);
 635       MRI.addLiveIn(RegisterJ);
 636       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 637           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 638       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 639           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 640
 641       if (slot % 4 < 2)
 642         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 643             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 644             RegisterJNode, RegisterINode);
 645       else
 646         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 647             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 648             RegisterJNode, RegisterINode);
 649       return SDValue(interp, slot % 2);
 650     }
 651     case AMDGPUIntrinsic::R600_interp_xy:
 652     case AMDGPUIntrinsic::R600_interp_zw: {
 653       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 654       MachineSDNode *interp;
 655       SDValue RegisterINode = Op.getOperand(2);
 656       SDValue RegisterJNode = Op.getOperand(3);
 657
 658       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 659         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 660             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 661             RegisterJNode, RegisterINode);
 662       else
 663         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 664             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 665             RegisterJNode, RegisterINode);
 666       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 667           SDValue(interp, 0), SDValue(interp, 1));
 668     }
 669     case AMDGPUIntrinsic::R600_tex:
 670     case AMDGPUIntrinsic::R600_texc:
 671     case AMDGPUIntrinsic::R600_txl:
 672     case AMDGPUIntrinsic::R600_txlc:
 673     case AMDGPUIntrinsic::R600_txb:
 674     case AMDGPUIntrinsic::R600_txbc:
 675     case AMDGPUIntrinsic::R600_txf:
 676     case AMDGPUIntrinsic::R600_txq:
 677     case AMDGPUIntrinsic::R600_ddx:
 678     case AMDGPUIntrinsic::R600_ddy:
 679     case AMDGPUIntrinsic::R600_ldptr: {
 680       unsigned TextureOp;
 681       switch (IntrinsicID) {
 682       case AMDGPUIntrinsic::R600_tex:
 683         TextureOp = 0;
 684         break;
 685       case AMDGPUIntrinsic::R600_texc:
 686         TextureOp = 1;
 687         break;
 688       case AMDGPUIntrinsic::R600_txl:
 689         TextureOp = 2;
 690         break;
 691       case AMDGPUIntrinsic::R600_txlc:
 692         TextureOp = 3;
 693         break;
 694       case AMDGPUIntrinsic::R600_txb:
 695         TextureOp = 4;
 696         break;
 697       case AMDGPUIntrinsic::R600_txbc:
 698         TextureOp = 5;
 699         break;
 700       case AMDGPUIntrinsic::R600_txf:
 701         TextureOp = 6;
 702         break;
 703       case AMDGPUIntrinsic::R600_txq:
 704         TextureOp = 7;
 705         break;
 706       case AMDGPUIntrinsic::R600_ddx:
 707         TextureOp = 8;
 708         break;
 709       case AMDGPUIntrinsic::R600_ddy:
 710         TextureOp = 9;
 711         break;
 712       case AMDGPUIntrinsic::R600_ldptr:
 713         TextureOp = 10;
 714         break;
 715       default:
 716         llvm_unreachable("Unknow Texture Operation");
 717       }
 718
 719       SDValue TexArgs[19] = {
 720         DAG.getConstant(TextureOp, MVT::i32),
 721         Op.getOperand(1),
 722         DAG.getConstant(0, MVT::i32),
 723         DAG.getConstant(1, MVT::i32),
 724         DAG.getConstant(2, MVT::i32),
 725         DAG.getConstant(3, MVT::i32),
 726         Op.getOperand(2),
 727         Op.getOperand(3),
 728         Op.getOperand(4),
 729         DAG.getConstant(0, MVT::i32),
 730         DAG.getConstant(1, MVT::i32),
 731         DAG.getConstant(2, MVT::i32),
 732         DAG.getConstant(3, MVT::i32),
 733         Op.getOperand(5),
 734         Op.getOperand(6),
 735         Op.getOperand(7),
 736         Op.getOperand(8),
 737         Op.getOperand(9),
 738         Op.getOperand(10)
 739       };
 740       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 741     }
 742     case AMDGPUIntrinsic::AMDGPU_dp4: {
 743       SDValue Args[8] = {
 744       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 745           DAG.getConstant(0, MVT::i32)),
 746       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 747           DAG.getConstant(0, MVT::i32)),
 748       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 749           DAG.getConstant(1, MVT::i32)),
 750       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 751           DAG.getConstant(1, MVT::i32)),
 752       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 753           DAG.getConstant(2, MVT::i32)),
 754       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 755           DAG.getConstant(2, MVT::i32)),
 756       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 757           DAG.getConstant(3, MVT::i32)),
 758       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 759           DAG.getConstant(3, MVT::i32))
 760       };
 761       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 762     }
 763
 764     case Intrinsic::r600_read_ngroups_x:
 765       return LowerImplicitParameter(DAG, VT, DL, 0);
 766     case Intrinsic::r600_read_ngroups_y:
 767       return LowerImplicitParameter(DAG, VT, DL, 1);
 768     case Intrinsic::r600_read_ngroups_z:
 769       return LowerImplicitParameter(DAG, VT, DL, 2);
 770     case Intrinsic::r600_read_global_size_x:
 771       return LowerImplicitParameter(DAG, VT, DL, 3);
 772     case Intrinsic::r600_read_global_size_y:
 773       return LowerImplicitParameter(DAG, VT, DL, 4);
 774     case Intrinsic::r600_read_global_size_z:
 775       return LowerImplicitParameter(DAG, VT, DL, 5);
 776     case Intrinsic::r600_read_local_size_x:
 777       return LowerImplicitParameter(DAG, VT, DL, 6);
 778     case Intrinsic::r600_read_local_size_y:
 779       return LowerImplicitParameter(DAG, VT, DL, 7);
 780     case Intrinsic::r600_read_local_size_z:
 781       return LowerImplicitParameter(DAG, VT, DL, 8);
 782
 783     case Intrinsic::r600_read_tgid_x:
 784       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 785                                   AMDGPU::T1_X, VT);
 786     case Intrinsic::r600_read_tgid_y:
 787       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 788                                   AMDGPU::T1_Y, VT);
 789     case Intrinsic::r600_read_tgid_z:
 790       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 791                                   AMDGPU::T1_Z, VT);
 792     case Intrinsic::r600_read_tidig_x:
 793       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 794                                   AMDGPU::T0_X, VT);
 795     case Intrinsic::r600_read_tidig_y:
 796       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 797                                   AMDGPU::T0_Y, VT);
 798     case Intrinsic::r600_read_tidig_z:
 799       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 800                                   AMDGPU::T0_Z, VT);
 801     }
 802     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 803     break;
 804   }
 805   } // end switch(Op.getOpcode())
 806   return SDValue();
 807 }
 808
 809 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 810                                             SmallVectorImpl<SDValue> &Results,
 811                                             SelectionDAG &DAG) const {
 812   switch (N->getOpcode()) {
 813   default:
 814     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 815     return;
 816   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 817     return;
 818   case ISD::LOAD: {
 819     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 820     Results.push_back(SDValue(Node, 0));
 821     Results.push_back(SDValue(Node, 1));
 822     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 823     // function
 824     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 825     return;
 826   }
 827   case ISD::STORE:
 828     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 829     Results.push_back(SDValue(Node, 0));
 830     return;
 831   }
 832 }
 833
 834 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 835                                                    SDValue Vector) const {
 836
 837   SDLoc DL(Vector);
 838   EVT VecVT = Vector.getValueType();
 839   EVT EltVT = VecVT.getVectorElementType();
 840   SmallVector<SDValue, 8> Args;
 841
 842   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 843                                                            i != e; ++i) {
 844     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 845                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 846   }
 847
 848   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 849 }
 850
 851 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 852                                                     SelectionDAG &DAG) const {
 853
 854   SDLoc DL(Op);
 855   SDValue Vector = Op.getOperand(0);
 856   SDValue Index = Op.getOperand(1);
 857
 858   if (isa<ConstantSDNode>(Index) ||
 859       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 860     return Op;
 861
 862   Vector = vectorToVerticalVector(DAG, Vector);
 863   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 864                      Vector, Index);
 865 }
 866
 867 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 868                                                    SelectionDAG &DAG) const {
 869   SDLoc DL(Op);
 870   SDValue Vector = Op.getOperand(0);
 871   SDValue Value = Op.getOperand(1);
 872   SDValue Index = Op.getOperand(2);
 873
 874   if (isa<ConstantSDNode>(Index) ||
 875       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 876     return Op;
 877
 878   Vector = vectorToVerticalVector(DAG, Vector);
 879   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 880                                Vector, Value, Index);
 881   return vectorToVerticalVector(DAG, Insert);
 882 }
 883
 884 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 885   // On hw >= R700, COS/SIN input must be between -1. and 1.
 886   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 887   EVT VT = Op.getValueType();
 888   SDValue Arg = Op.getOperand(0);
 889   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 890       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 891         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 892           DAG.getConstantFP(0.15915494309, MVT::f32)),
 893         DAG.getConstantFP(0.5, MVT::f32)));
 894   unsigned TrigNode;
 895   switch (Op.getOpcode()) {
 896   case ISD::FCOS:
 897     TrigNode = AMDGPUISD::COS_HW;
 898     break;
 899   case ISD::FSIN:
 900     TrigNode = AMDGPUISD::SIN_HW;
 901     break;
 902   default:
 903     llvm_unreachable("Wrong trig opcode");
 904   }
 905   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 906       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 907         DAG.getConstantFP(-0.5, MVT::f32)));
 908   if (Gen >= AMDGPUSubtarget::R700)
 909     return TrigVal;
 910   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 911   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 912       DAG.getConstantFP(3.14159265359, MVT::f32));
 913 }
 914
 915 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
 916   SDLoc DL(Op);
 917   EVT VT = Op.getValueType();
 918
 919   SDValue Lo = Op.getOperand(0);
 920   SDValue Hi = Op.getOperand(1);
 921   SDValue Shift = Op.getOperand(2);
 922   SDValue Zero = DAG.getConstant(0, VT);
 923   SDValue One  = DAG.getConstant(1, VT);
 924
 925   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
 926   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
 927   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 928   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 929
 930   // The dance around Width1 is necessary for 0 special case.
 931   // Without it the CompShift might be 32, producing incorrect results in
 932   // Overflow. So we do the shift in two steps, the alternative is to
 933   // add a conditional to filter the special case.
 934
 935   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
 936   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
 937
 938   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
 939   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
 940   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
 941
 942   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
 943   SDValue LoBig = Zero;
 944
 945   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
 946   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
 947
 948   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
 949 }
 950
 951 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
 952   SDLoc DL(Op);
 953   EVT VT = Op.getValueType();
 954
 955   SDValue Lo = Op.getOperand(0);
 956   SDValue Hi = Op.getOperand(1);
 957   SDValue Shift = Op.getOperand(2);
 958   SDValue Zero = DAG.getConstant(0, VT);
 959   SDValue One  = DAG.getConstant(1, VT);
 960
 961   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
 962   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
 963   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 964   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 965
 966   // The dance around Width1 is necessary for 0 special case.
 967   // Without it the CompShift might be 32, producing incorrect results in
 968   // Overflow. So we do the shift in two steps, the alternative is to
 969   // add a conditional to filter the special case.
 970
 971   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
 972   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
 973
 974   // TODO: SRA support here
 975   SDValue HiSmall = DAG.getNode(ISD::SRL, DL, VT, Hi, Shift);
 976   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
 977   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
 978
 979   // TODO: SRA support here
 980   SDValue LoBig = DAG.getNode(ISD::SRL, DL, VT, Hi, BigShift);
 981   SDValue HiBig = Zero;
 982
 983   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
 984   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
 985
 986   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
 987 }
 988
 989 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 990   return DAG.getNode(
 991       ISD::SETCC,
 992       SDLoc(Op),
 993       MVT::i1,
 994       Op, DAG.getConstantFP(0.0f, MVT::f32),
 995       DAG.getCondCode(ISD::SETNE)
 996       );
 997 }
 998
 999 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1000                                                    SDLoc DL,
1001                                                    unsigned DwordOffset) const {
1002   unsigned ByteOffset = DwordOffset * 4;
1003   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1004                                       AMDGPUAS::CONSTANT_BUFFER_0);
1005
1006   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1007   assert(isInt<16>(ByteOffset));
1008
1009   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1010                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1011                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1012                      false, false, false, 0);
1013 }
1014
1015 bool R600TargetLowering::isZero(SDValue Op) const {
1016   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1017     return Cst->isNullValue();
1018   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1019     return CstFP->isZero();
1020   } else {
1021     return false;
1022   }
1023 }
1024
1025 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1026   SDLoc DL(Op);
1027   EVT VT = Op.getValueType();
1028
1029   SDValue LHS = Op.getOperand(0);
1030   SDValue RHS = Op.getOperand(1);
1031   SDValue True = Op.getOperand(2);
1032   SDValue False = Op.getOperand(3);
1033   SDValue CC = Op.getOperand(4);
1034   SDValue Temp;
1035
1036   // LHS and RHS are guaranteed to be the same value type
1037   EVT CompareVT = LHS.getValueType();
1038
1039   // Check if we can lower this to a native operation.
1040
1041   // Try to lower to a SET* instruction:
1042   //
1043   // SET* can match the following patterns:
1044   //
1045   // select_cc f32, f32, -1,  0, cc_supported
1046   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1047   // select_cc i32, i32, -1,  0, cc_supported
1048   //
1049
1050   // Move hardware True/False values to the correct operand.
1051   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1052   ISD::CondCode InverseCC =
1053      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1054   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1055     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1056       std::swap(False, True);
1057       CC = DAG.getCondCode(InverseCC);
1058     } else {
1059       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1060       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1061         std::swap(False, True);
1062         std::swap(LHS, RHS);
1063         CC = DAG.getCondCode(SwapInvCC);
1064       }
1065     }
1066   }
1067
1068   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1069       (CompareVT == VT || VT == MVT::i32)) {
1070     // This can be matched by a SET* instruction.
1071     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1072   }
1073
1074   // Try to lower to a CND* instruction:
1075   //
1076   // CND* can match the following patterns:
1077   //
1078   // select_cc f32, 0.0, f32, f32, cc_supported
1079   // select_cc f32, 0.0, i32, i32, cc_supported
1080   // select_cc i32, 0,   f32, f32, cc_supported
1081   // select_cc i32, 0,   i32, i32, cc_supported
1082   //
1083
1084   // Try to move the zero value to the RHS
1085   if (isZero(LHS)) {
1086     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1087     // Try swapping the operands
1088     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1089     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1090       std::swap(LHS, RHS);
1091       CC = DAG.getCondCode(CCSwapped);
1092     } else {
1093       // Try inverting the conditon and then swapping the operands
1094       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1095       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1096       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1097         std::swap(True, False);
1098         std::swap(LHS, RHS);
1099         CC = DAG.getCondCode(CCSwapped);
1100       }
1101     }
1102   }
1103   if (isZero(RHS)) {
1104     SDValue Cond = LHS;
1105     SDValue Zero = RHS;
1106     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1107     if (CompareVT != VT) {
1108       // Bitcast True / False to the correct types.  This will end up being
1109       // a nop, but it allows us to define only a single pattern in the
1110       // .TD files for each CND* instruction rather than having to have
1111       // one pattern for integer True/False and one for fp True/False
1112       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1113       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1114     }
1115
1116     switch (CCOpcode) {
1117     case ISD::SETONE:
1118     case ISD::SETUNE:
1119     case ISD::SETNE:
1120       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1121       Temp = True;
1122       True = False;
1123       False = Temp;
1124       break;
1125     default:
1126       break;
1127     }
1128     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1129         Cond, Zero,
1130         True, False,
1131         DAG.getCondCode(CCOpcode));
1132     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1133   }
1134
1135   // If we make it this for it means we have no native instructions to handle
1136   // this SELECT_CC, so we must lower it.
1137   SDValue HWTrue, HWFalse;
1138
1139   if (CompareVT == MVT::f32) {
1140     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1141     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1142   } else if (CompareVT == MVT::i32) {
1143     HWTrue = DAG.getConstant(-1, CompareVT);
1144     HWFalse = DAG.getConstant(0, CompareVT);
1145   }
1146   else {
1147     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1148   }
1149
1150   // Lower this unsupported SELECT_CC into a combination of two supported
1151   // SELECT_CC operations.
1152   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1153
1154   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1155       Cond, HWFalse,
1156       True, False,
1157       DAG.getCondCode(ISD::SETNE));
1158 }
1159
1160 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1161 /// convert these pointers to a register index.  Each register holds
1162 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1163 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1164 /// for indirect addressing.
1165 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1166                                                unsigned StackWidth,
1167                                                SelectionDAG &DAG) const {
1168   unsigned SRLPad;
1169   switch(StackWidth) {
1170   case 1:
1171     SRLPad = 2;
1172     break;
1173   case 2:
1174     SRLPad = 3;
1175     break;
1176   case 4:
1177     SRLPad = 4;
1178     break;
1179   default: llvm_unreachable("Invalid stack width");
1180   }
1181
1182   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1183                      DAG.getConstant(SRLPad, MVT::i32));
1184 }
1185
1186 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1187                                          unsigned ElemIdx,
1188                                          unsigned &Channel,
1189                                          unsigned &PtrIncr) const {
1190   switch (StackWidth) {
1191   default:
1192   case 1:
1193     Channel = 0;
1194     if (ElemIdx > 0) {
1195       PtrIncr = 1;
1196     } else {
1197       PtrIncr = 0;
1198     }
1199     break;
1200   case 2:
1201     Channel = ElemIdx % 2;
1202     if (ElemIdx == 2) {
1203       PtrIncr = 1;
1204     } else {
1205       PtrIncr = 0;
1206     }
1207     break;
1208   case 4:
1209     Channel = ElemIdx;
1210     PtrIncr = 0;
1211     break;
1212   }
1213 }
1214
1215 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1216   SDLoc DL(Op);
1217   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1218   SDValue Chain = Op.getOperand(0);
1219   SDValue Value = Op.getOperand(1);
1220   SDValue Ptr = Op.getOperand(2);
1221
1222   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1223   if (Result.getNode()) {
1224     return Result;
1225   }
1226
1227   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1228     if (StoreNode->isTruncatingStore()) {
1229       EVT VT = Value.getValueType();
1230       assert(VT.bitsLE(MVT::i32));
1231       EVT MemVT = StoreNode->getMemoryVT();
1232       SDValue MaskConstant;
1233       if (MemVT == MVT::i8) {
1234         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1235       } else {
1236         assert(MemVT == MVT::i16);
1237         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1238       }
1239       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1240                                       DAG.getConstant(2, MVT::i32));
1241       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1242                                       DAG.getConstant(0x00000003, VT));
1243       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1244       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1245                                    DAG.getConstant(3, VT));
1246       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1247       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1248       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1249       // vector instead.
1250       SDValue Src[4] = {
1251         ShiftedValue,
1252         DAG.getConstant(0, MVT::i32),
1253         DAG.getConstant(0, MVT::i32),
1254         Mask
1255       };
1256       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1257       SDValue Args[3] = { Chain, Input, DWordAddr };
1258       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1259                                      Op->getVTList(), Args, MemVT,
1260                                      StoreNode->getMemOperand());
1261     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1262                Value.getValueType().bitsGE(MVT::i32)) {
1263       // Convert pointer from byte address to dword address.
1264       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1265                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1266                                     Ptr, DAG.getConstant(2, MVT::i32)));
1267
1268       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1269         llvm_unreachable("Truncated and indexed stores not supported yet");
1270       } else {
1271         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1272       }
1273       return Chain;
1274     }
1275   }
1276
1277   EVT ValueVT = Value.getValueType();
1278
1279   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1280     return SDValue();
1281   }
1282
1283   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1284   if (Ret.getNode()) {
1285     return Ret;
1286   }
1287   // Lowering for indirect addressing
1288
1289   const MachineFunction &MF = DAG.getMachineFunction();
1290   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1291                                          getTargetMachine().getFrameLowering());
1292   unsigned StackWidth = TFL->getStackWidth(MF);
1293
1294   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1295
1296   if (ValueVT.isVector()) {
1297     unsigned NumElemVT = ValueVT.getVectorNumElements();
1298     EVT ElemVT = ValueVT.getVectorElementType();
1299     SmallVector<SDValue, 4> Stores(NumElemVT);
1300
1301     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1302                                       "vector width in load");
1303
1304     for (unsigned i = 0; i < NumElemVT; ++i) {
1305       unsigned Channel, PtrIncr;
1306       getStackAddress(StackWidth, i, Channel, PtrIncr);
1307       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1308                         DAG.getConstant(PtrIncr, MVT::i32));
1309       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1310                                  Value, DAG.getConstant(i, MVT::i32));
1311
1312       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1313                               Chain, Elem, Ptr,
1314                               DAG.getTargetConstant(Channel, MVT::i32));
1315     }
1316      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1317    } else {
1318     if (ValueVT == MVT::i8) {
1319       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1320     }
1321     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1322     DAG.getTargetConstant(0, MVT::i32)); // Channel
1323   }
1324
1325   return Chain;
1326 }
1327
1328 // return (512 + (kc_bank << 12)
1329 static int
1330 ConstantAddressBlock(unsigned AddressSpace) {
1331   switch (AddressSpace) {
1332   case AMDGPUAS::CONSTANT_BUFFER_0:
1333     return 512;
1334   case AMDGPUAS::CONSTANT_BUFFER_1:
1335     return 512 + 4096;
1336   case AMDGPUAS::CONSTANT_BUFFER_2:
1337     return 512 + 4096 * 2;
1338   case AMDGPUAS::CONSTANT_BUFFER_3:
1339     return 512 + 4096 * 3;
1340   case AMDGPUAS::CONSTANT_BUFFER_4:
1341     return 512 + 4096 * 4;
1342   case AMDGPUAS::CONSTANT_BUFFER_5:
1343     return 512 + 4096 * 5;
1344   case AMDGPUAS::CONSTANT_BUFFER_6:
1345     return 512 + 4096 * 6;
1346   case AMDGPUAS::CONSTANT_BUFFER_7:
1347     return 512 + 4096 * 7;
1348   case AMDGPUAS::CONSTANT_BUFFER_8:
1349     return 512 + 4096 * 8;
1350   case AMDGPUAS::CONSTANT_BUFFER_9:
1351     return 512 + 4096 * 9;
1352   case AMDGPUAS::CONSTANT_BUFFER_10:
1353     return 512 + 4096 * 10;
1354   case AMDGPUAS::CONSTANT_BUFFER_11:
1355     return 512 + 4096 * 11;
1356   case AMDGPUAS::CONSTANT_BUFFER_12:
1357     return 512 + 4096 * 12;
1358   case AMDGPUAS::CONSTANT_BUFFER_13:
1359     return 512 + 4096 * 13;
1360   case AMDGPUAS::CONSTANT_BUFFER_14:
1361     return 512 + 4096 * 14;
1362   case AMDGPUAS::CONSTANT_BUFFER_15:
1363     return 512 + 4096 * 15;
1364   default:
1365     return -1;
1366   }
1367 }
1368
1369 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1370 {
1371   EVT VT = Op.getValueType();
1372   SDLoc DL(Op);
1373   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1374   SDValue Chain = Op.getOperand(0);
1375   SDValue Ptr = Op.getOperand(1);
1376   SDValue LoweredLoad;
1377
1378   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1379   if (Ret.getNode()) {
1380     SDValue Ops[2] = {
1381       Ret,
1382       Chain
1383     };
1384     return DAG.getMergeValues(Ops, DL);
1385   }
1386
1387
1388   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1389     SDValue MergedValues[2] = {
1390       SplitVectorLoad(Op, DAG),
1391       Chain
1392     };
1393     return DAG.getMergeValues(MergedValues, DL);
1394   }
1395
1396   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1397   if (ConstantBlock > -1 &&
1398       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1399        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1400     SDValue Result;
1401     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1402         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1403         isa<ConstantSDNode>(Ptr)) {
1404       SDValue Slots[4];
1405       for (unsigned i = 0; i < 4; i++) {
1406         // We want Const position encoded with the following formula :
1407         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1408         // const_index is Ptr computed by llvm using an alignment of 16.
1409         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1410         // then div by 4 at the ISel step
1411         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1412             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1413         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1414       }
1415       EVT NewVT = MVT::v4i32;
1416       unsigned NumElements = 4;
1417       if (VT.isVector()) {
1418         NewVT = VT;
1419         NumElements = VT.getVectorNumElements();
1420       }
1421       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1422                            makeArrayRef(Slots, NumElements));
1423     } else {
1424       // non-constant ptr can't be folded, keeps it as a v4f32 load
1425       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1426           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1427           DAG.getConstant(LoadNode->getAddressSpace() -
1428                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1429           );
1430     }
1431
1432     if (!VT.isVector()) {
1433       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1434           DAG.getConstant(0, MVT::i32));
1435     }
1436
1437     SDValue MergedValues[2] = {
1438       Result,
1439       Chain
1440     };
1441     return DAG.getMergeValues(MergedValues, DL);
1442   }
1443
1444   // For most operations returning SDValue() will result in the node being
1445   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1446   // need to manually expand loads that may be legal in some address spaces and
1447   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1448   // compute shaders, since the data is sign extended when it is uploaded to the
1449   // buffer. However SEXT loads from other address spaces are not supported, so
1450   // we need to expand them here.
1451   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1452     EVT MemVT = LoadNode->getMemoryVT();
1453     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1454     SDValue ShiftAmount =
1455           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1456     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1457                                   LoadNode->getPointerInfo(), MemVT,
1458                                   LoadNode->isVolatile(),
1459                                   LoadNode->isNonTemporal(),
1460                                   LoadNode->getAlignment());
1461     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1462     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1463
1464     SDValue MergedValues[2] = { Sra, Chain };
1465     return DAG.getMergeValues(MergedValues, DL);
1466   }
1467
1468   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1469     return SDValue();
1470   }
1471
1472   // Lowering for indirect addressing
1473   const MachineFunction &MF = DAG.getMachineFunction();
1474   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1475                                          getTargetMachine().getFrameLowering());
1476   unsigned StackWidth = TFL->getStackWidth(MF);
1477
1478   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1479
1480   if (VT.isVector()) {
1481     unsigned NumElemVT = VT.getVectorNumElements();
1482     EVT ElemVT = VT.getVectorElementType();
1483     SDValue Loads[4];
1484
1485     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1486                                       "vector width in load");
1487
1488     for (unsigned i = 0; i < NumElemVT; ++i) {
1489       unsigned Channel, PtrIncr;
1490       getStackAddress(StackWidth, i, Channel, PtrIncr);
1491       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1492                         DAG.getConstant(PtrIncr, MVT::i32));
1493       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1494                              Chain, Ptr,
1495                              DAG.getTargetConstant(Channel, MVT::i32),
1496                              Op.getOperand(2));
1497     }
1498     for (unsigned i = NumElemVT; i < 4; ++i) {
1499       Loads[i] = DAG.getUNDEF(ElemVT);
1500     }
1501     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1502     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1503   } else {
1504     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1505                               Chain, Ptr,
1506                               DAG.getTargetConstant(0, MVT::i32), // Channel
1507                               Op.getOperand(2));
1508   }
1509
1510   SDValue Ops[2] = {
1511     LoweredLoad,
1512     Chain
1513   };
1514
1515   return DAG.getMergeValues(Ops, DL);
1516 }
1517
1518 /// XXX Only kernel functions are supported, so we can assume for now that
1519 /// every function is a kernel function, but in the future we should use
1520 /// separate calling conventions for kernel and non-kernel functions.
1521 SDValue R600TargetLowering::LowerFormalArguments(
1522                                       SDValue Chain,
1523                                       CallingConv::ID CallConv,
1524                                       bool isVarArg,
1525                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1526                                       SDLoc DL, SelectionDAG &DAG,
1527                                       SmallVectorImpl<SDValue> &InVals) const {
1528   SmallVector<CCValAssign, 16> ArgLocs;
1529   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1530                  getTargetMachine(), ArgLocs, *DAG.getContext());
1531   MachineFunction &MF = DAG.getMachineFunction();
1532   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
1533
1534   SmallVector<ISD::InputArg, 8> LocalIns;
1535
1536   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1537
1538   AnalyzeFormalArguments(CCInfo, LocalIns);
1539
1540   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1541     CCValAssign &VA = ArgLocs[i];
1542     EVT VT = Ins[i].VT;
1543     EVT MemVT = LocalIns[i].VT;
1544
1545     if (ShaderType != ShaderType::COMPUTE) {
1546       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1547       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1548       InVals.push_back(Register);
1549       continue;
1550     }
1551
1552     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1553                                                    AMDGPUAS::CONSTANT_BUFFER_0);
1554
1555     // i64 isn't a legal type, so the register type used ends up as i32, which
1556     // isn't expected here. It attempts to create this sextload, but it ends up
1557     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1558     // for <1 x i64>.
1559
1560     // The first 36 bytes of the input buffer contains information about
1561     // thread group and global sizes.
1562
1563     // FIXME: This should really check the extload type, but the handling of
1564     // extload vecto parameters seems to be broken.
1565     //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1566     ISD::LoadExtType Ext = ISD::SEXTLOAD;
1567     SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
1568                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
1569                                  MachinePointerInfo(UndefValue::get(PtrTy)),
1570                                  MemVT, false, false, 4);
1571
1572     // 4 is the preferred alignment for the CONSTANT memory space.
1573     InVals.push_back(Arg);
1574   }
1575   return Chain;
1576 }
1577
1578 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1579    if (!VT.isVector())
1580      return MVT::i32;
1581    return VT.changeVectorElementTypeToInteger();
1582 }
1583
1584 static SDValue CompactSwizzlableVector(
1585   SelectionDAG &DAG, SDValue VectorEntry,
1586   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1587   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1588   assert(RemapSwizzle.empty());
1589   SDValue NewBldVec[4] = {
1590     VectorEntry.getOperand(0),
1591     VectorEntry.getOperand(1),
1592     VectorEntry.getOperand(2),
1593     VectorEntry.getOperand(3)
1594   };
1595
1596   for (unsigned i = 0; i < 4; i++) {
1597     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1598       // We mask write here to teach later passes that the ith element of this
1599       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1600       // break false dependencies and additionnaly make assembly easier to read.
1601       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1602     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1603       if (C->isZero()) {
1604         RemapSwizzle[i] = 4; // SEL_0
1605         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1606       } else if (C->isExactlyValue(1.0)) {
1607         RemapSwizzle[i] = 5; // SEL_1
1608         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1609       }
1610     }
1611
1612     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1613       continue;
1614     for (unsigned j = 0; j < i; j++) {
1615       if (NewBldVec[i] == NewBldVec[j]) {
1616         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1617         RemapSwizzle[i] = j;
1618         break;
1619       }
1620     }
1621   }
1622
1623   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1624                      VectorEntry.getValueType(), NewBldVec);
1625 }
1626
1627 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1628                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1629   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1630   assert(RemapSwizzle.empty());
1631   SDValue NewBldVec[4] = {
1632       VectorEntry.getOperand(0),
1633       VectorEntry.getOperand(1),
1634       VectorEntry.getOperand(2),
1635       VectorEntry.getOperand(3)
1636   };
1637   bool isUnmovable[4] = { false, false, false, false };
1638   for (unsigned i = 0; i < 4; i++) {
1639     RemapSwizzle[i] = i;
1640     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1641       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1642           ->getZExtValue();
1643       if (i == Idx)
1644         isUnmovable[Idx] = true;
1645     }
1646   }
1647
1648   for (unsigned i = 0; i < 4; i++) {
1649     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1650       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1651           ->getZExtValue();
1652       if (isUnmovable[Idx])
1653         continue;
1654       // Swap i and Idx
1655       std::swap(NewBldVec[Idx], NewBldVec[i]);
1656       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1657       break;
1658     }
1659   }
1660
1661   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1662                      VectorEntry.getValueType(), NewBldVec);
1663 }
1664
1665
1666 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1667 SDValue Swz[4], SelectionDAG &DAG) const {
1668   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1669   // Old -> New swizzle values
1670   DenseMap<unsigned, unsigned> SwizzleRemap;
1671
1672   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1673   for (unsigned i = 0; i < 4; i++) {
1674     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1675     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1676       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1677   }
1678
1679   SwizzleRemap.clear();
1680   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1681   for (unsigned i = 0; i < 4; i++) {
1682     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1683     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1684       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1685   }
1686
1687   return BuildVector;
1688 }
1689
1690
1691 //===----------------------------------------------------------------------===//
1692 // Custom DAG Optimizations
1693 //===----------------------------------------------------------------------===//
1694
1695 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1696                                               DAGCombinerInfo &DCI) const {
1697   SelectionDAG &DAG = DCI.DAG;
1698
1699   switch (N->getOpcode()) {
1700   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1701   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1702   case ISD::FP_ROUND: {
1703       SDValue Arg = N->getOperand(0);
1704       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1705         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1706                            Arg.getOperand(0));
1707       }
1708       break;
1709     }
1710
1711   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1712   // (i32 select_cc f32, f32, -1, 0 cc)
1713   //
1714   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1715   // this to one of the SET*_DX10 instructions.
1716   case ISD::FP_TO_SINT: {
1717     SDValue FNeg = N->getOperand(0);
1718     if (FNeg.getOpcode() != ISD::FNEG) {
1719       return SDValue();
1720     }
1721     SDValue SelectCC = FNeg.getOperand(0);
1722     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1723         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1724         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1725         !isHWTrueValue(SelectCC.getOperand(2)) ||
1726         !isHWFalseValue(SelectCC.getOperand(3))) {
1727       return SDValue();
1728     }
1729
1730     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1731                            SelectCC.getOperand(0), // LHS
1732                            SelectCC.getOperand(1), // RHS
1733                            DAG.getConstant(-1, MVT::i32), // True
1734                            DAG.getConstant(0, MVT::i32),  // Flase
1735                            SelectCC.getOperand(4)); // CC
1736
1737     break;
1738   }
1739
1740   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1741   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1742   case ISD::INSERT_VECTOR_ELT: {
1743     SDValue InVec = N->getOperand(0);
1744     SDValue InVal = N->getOperand(1);
1745     SDValue EltNo = N->getOperand(2);
1746     SDLoc dl(N);
1747
1748     // If the inserted element is an UNDEF, just use the input vector.
1749     if (InVal.getOpcode() == ISD::UNDEF)
1750       return InVec;
1751
1752     EVT VT = InVec.getValueType();
1753
1754     // If we can't generate a legal BUILD_VECTOR, exit
1755     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1756       return SDValue();
1757
1758     // Check that we know which element is being inserted
1759     if (!isa<ConstantSDNode>(EltNo))
1760       return SDValue();
1761     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1762
1763     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1764     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1765     // vector elements.
1766     SmallVector<SDValue, 8> Ops;
1767     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1768       Ops.append(InVec.getNode()->op_begin(),
1769                  InVec.getNode()->op_end());
1770     } else if (InVec.getOpcode() == ISD::UNDEF) {
1771       unsigned NElts = VT.getVectorNumElements();
1772       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1773     } else {
1774       return SDValue();
1775     }
1776
1777     // Insert the element
1778     if (Elt < Ops.size()) {
1779       // All the operands of BUILD_VECTOR must have the same type;
1780       // we enforce that here.
1781       EVT OpVT = Ops[0].getValueType();
1782       if (InVal.getValueType() != OpVT)
1783         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1784           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1785           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1786       Ops[Elt] = InVal;
1787     }
1788
1789     // Return the new vector
1790     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1791   }
1792
1793   // Extract_vec (Build_vector) generated by custom lowering
1794   // also needs to be customly combined
1795   case ISD::EXTRACT_VECTOR_ELT: {
1796     SDValue Arg = N->getOperand(0);
1797     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1798       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1799         unsigned Element = Const->getZExtValue();
1800         return Arg->getOperand(Element);
1801       }
1802     }
1803     if (Arg.getOpcode() == ISD::BITCAST &&
1804         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1805       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1806         unsigned Element = Const->getZExtValue();
1807         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1808             Arg->getOperand(0).getOperand(Element));
1809       }
1810     }
1811   }
1812
1813   case ISD::SELECT_CC: {
1814     // Try common optimizations
1815     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1816     if (Ret.getNode())
1817       return Ret;
1818
1819     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1820     //      selectcc x, y, a, b, inv(cc)
1821     //
1822     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1823     //      selectcc x, y, a, b, cc
1824     SDValue LHS = N->getOperand(0);
1825     if (LHS.getOpcode() != ISD::SELECT_CC) {
1826       return SDValue();
1827     }
1828
1829     SDValue RHS = N->getOperand(1);
1830     SDValue True = N->getOperand(2);
1831     SDValue False = N->getOperand(3);
1832     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1833
1834     if (LHS.getOperand(2).getNode() != True.getNode() ||
1835         LHS.getOperand(3).getNode() != False.getNode() ||
1836         RHS.getNode() != False.getNode()) {
1837       return SDValue();
1838     }
1839
1840     switch (NCC) {
1841     default: return SDValue();
1842     case ISD::SETNE: return LHS;
1843     case ISD::SETEQ: {
1844       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1845       LHSCC = ISD::getSetCCInverse(LHSCC,
1846                                   LHS.getOperand(0).getValueType().isInteger());
1847       if (DCI.isBeforeLegalizeOps() ||
1848           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1849         return DAG.getSelectCC(SDLoc(N),
1850                                LHS.getOperand(0),
1851                                LHS.getOperand(1),
1852                                LHS.getOperand(2),
1853                                LHS.getOperand(3),
1854                                LHSCC);
1855       break;
1856     }
1857     }
1858     return SDValue();
1859   }
1860
1861   case AMDGPUISD::EXPORT: {
1862     SDValue Arg = N->getOperand(1);
1863     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1864       break;
1865
1866     SDValue NewArgs[8] = {
1867       N->getOperand(0), // Chain
1868       SDValue(),
1869       N->getOperand(2), // ArrayBase
1870       N->getOperand(3), // Type
1871       N->getOperand(4), // SWZ_X
1872       N->getOperand(5), // SWZ_Y
1873       N->getOperand(6), // SWZ_Z
1874       N->getOperand(7) // SWZ_W
1875     };
1876     SDLoc DL(N);
1877     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
1878     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
1879   }
1880   case AMDGPUISD::TEXTURE_FETCH: {
1881     SDValue Arg = N->getOperand(1);
1882     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1883       break;
1884
1885     SDValue NewArgs[19] = {
1886       N->getOperand(0),
1887       N->getOperand(1),
1888       N->getOperand(2),
1889       N->getOperand(3),
1890       N->getOperand(4),
1891       N->getOperand(5),
1892       N->getOperand(6),
1893       N->getOperand(7),
1894       N->getOperand(8),
1895       N->getOperand(9),
1896       N->getOperand(10),
1897       N->getOperand(11),
1898       N->getOperand(12),
1899       N->getOperand(13),
1900       N->getOperand(14),
1901       N->getOperand(15),
1902       N->getOperand(16),
1903       N->getOperand(17),
1904       N->getOperand(18),
1905     };
1906     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
1907     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
1908         NewArgs);
1909   }
1910   }
1911
1912   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1913 }
1914
1915 static bool
1916 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
1917             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
1918   const R600InstrInfo *TII =
1919       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
1920   if (!Src.isMachineOpcode())
1921     return false;
1922   switch (Src.getMachineOpcode()) {
1923   case AMDGPU::FNEG_R600:
1924     if (!Neg.getNode())
1925       return false;
1926     Src = Src.getOperand(0);
1927     Neg = DAG.getTargetConstant(1, MVT::i32);
1928     return true;
1929   case AMDGPU::FABS_R600:
1930     if (!Abs.getNode())
1931       return false;
1932     Src = Src.getOperand(0);
1933     Abs = DAG.getTargetConstant(1, MVT::i32);
1934     return true;
1935   case AMDGPU::CONST_COPY: {
1936     unsigned Opcode = ParentNode->getMachineOpcode();
1937     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
1938
1939     if (!Sel.getNode())
1940       return false;
1941
1942     SDValue CstOffset = Src.getOperand(0);
1943     if (ParentNode->getValueType(0).isVector())
1944       return false;
1945
1946     // Gather constants values
1947     int SrcIndices[] = {
1948       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
1949       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
1950       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
1951       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
1952       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
1953       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
1954       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
1955       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
1956       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
1957       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
1958       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
1959     };
1960     std::vector<unsigned> Consts;
1961     for (int OtherSrcIdx : SrcIndices) {
1962       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
1963       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
1964         continue;
1965       if (HasDst) {
1966         OtherSrcIdx--;
1967         OtherSelIdx--;
1968       }
1969       if (RegisterSDNode *Reg =
1970           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
1971         if (Reg->getReg() == AMDGPU::ALU_CONST) {
1972           ConstantSDNode *Cst
1973             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
1974           Consts.push_back(Cst->getZExtValue());
1975         }
1976       }
1977     }
1978
1979     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
1980     Consts.push_back(Cst->getZExtValue());
1981     if (!TII->fitsConstReadLimitations(Consts)) {
1982       return false;
1983     }
1984
1985     Sel = CstOffset;
1986     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
1987     return true;
1988   }
1989   case AMDGPU::MOV_IMM_I32:
1990   case AMDGPU::MOV_IMM_F32: {
1991     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
1992     uint64_t ImmValue = 0;
1993
1994
1995     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
1996       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
1997       float FloatValue = FPC->getValueAPF().convertToFloat();
1998       if (FloatValue == 0.0) {
1999         ImmReg = AMDGPU::ZERO;
2000       } else if (FloatValue == 0.5) {
2001         ImmReg = AMDGPU::HALF;
2002       } else if (FloatValue == 1.0) {
2003         ImmReg = AMDGPU::ONE;
2004       } else {
2005         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2006       }
2007     } else {
2008       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2009       uint64_t Value = C->getZExtValue();
2010       if (Value == 0) {
2011         ImmReg = AMDGPU::ZERO;
2012       } else if (Value == 1) {
2013         ImmReg = AMDGPU::ONE_INT;
2014       } else {
2015         ImmValue = Value;
2016       }
2017     }
2018
2019     // Check that we aren't already using an immediate.
2020     // XXX: It's possible for an instruction to have more than one
2021     // immediate operand, but this is not supported yet.
2022     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2023       if (!Imm.getNode())
2024         return false;
2025       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2026       assert(C);
2027       if (C->getZExtValue())
2028         return false;
2029       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2030     }
2031     Src = DAG.getRegister(ImmReg, MVT::i32);
2032     return true;
2033   }
2034   default:
2035     return false;
2036   }
2037 }
2038
2039
2040 /// \brief Fold the instructions after selecting them
2041 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2042                                             SelectionDAG &DAG) const {
2043   const R600InstrInfo *TII =
2044       static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
2045   if (!Node->isMachineOpcode())
2046     return Node;
2047   unsigned Opcode = Node->getMachineOpcode();
2048   SDValue FakeOp;
2049
2050   std::vector<SDValue> Ops;
2051   for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
2052               I != E; ++I)
2053           Ops.push_back(*I);
2054
2055   if (Opcode == AMDGPU::DOT_4) {
2056     int OperandIdx[] = {
2057       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2058       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2059       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2060       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2061       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2062       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2063       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2064       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2065         };
2066     int NegIdx[] = {
2067       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2068       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2069       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2070       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2071       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2072       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2073       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2074       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2075     };
2076     int AbsIdx[] = {
2077       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2078       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2079       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2080       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2081       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2082       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2083       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2084       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2085     };
2086     for (unsigned i = 0; i < 8; i++) {
2087       if (OperandIdx[i] < 0)
2088         return Node;
2089       SDValue &Src = Ops[OperandIdx[i] - 1];
2090       SDValue &Neg = Ops[NegIdx[i] - 1];
2091       SDValue &Abs = Ops[AbsIdx[i] - 1];
2092       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2093       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2094       if (HasDst)
2095         SelIdx--;
2096       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2097       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2098         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2099     }
2100   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2101     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2102       SDValue &Src = Ops[i];
2103       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2104         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2105     }
2106   } else if (Opcode == AMDGPU::CLAMP_R600) {
2107     SDValue Src = Node->getOperand(0);
2108     if (!Src.isMachineOpcode() ||
2109         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2110       return Node;
2111     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2112         AMDGPU::OpName::clamp);
2113     if (ClampIdx < 0)
2114       return Node;
2115     std::vector<SDValue> Ops;
2116     unsigned NumOp = Src.getNumOperands();
2117     for(unsigned i = 0; i < NumOp; ++i)
2118           Ops.push_back(Src.getOperand(i));
2119     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2120     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2121         Node->getVTList(), Ops);
2122   } else {
2123     if (!TII->hasInstrModifiers(Opcode))
2124       return Node;
2125     int OperandIdx[] = {
2126       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2127       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2128       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2129     };
2130     int NegIdx[] = {
2131       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2132       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2133       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2134     };
2135     int AbsIdx[] = {
2136       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2137       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2138       -1
2139     };
2140     for (unsigned i = 0; i < 3; i++) {
2141       if (OperandIdx[i] < 0)
2142         return Node;
2143       SDValue &Src = Ops[OperandIdx[i] - 1];
2144       SDValue &Neg = Ops[NegIdx[i] - 1];
2145       SDValue FakeAbs;
2146       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2147       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2148       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2149       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2150       if (HasDst) {
2151         SelIdx--;
2152         ImmIdx--;
2153       }
2154       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2155       SDValue &Imm = Ops[ImmIdx];
2156       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2157         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2158     }
2159   }
2160
2161   return Node;
2162 }