lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  34     AMDGPUTargetLowering(TM),
  35     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties();
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
 126   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
 127   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
 128   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
 129   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
 130   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
 131
 132   setOperationAction(ISD::STORE, MVT::i8, Custom);
 133   setOperationAction(ISD::STORE, MVT::i32, Custom);
 134   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 135   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 136   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 137   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 138
 139   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 140   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 141   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 142
 143   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 144   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 145   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 146   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 147
 148   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 149   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 150   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 151   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 152
 153   setTargetDAGCombine(ISD::FP_ROUND);
 154   setTargetDAGCombine(ISD::FP_TO_SINT);
 155   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 156   setTargetDAGCombine(ISD::SELECT_CC);
 157   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 158
 159   setOperationAction(ISD::SUB, MVT::i64, Expand);
 160
 161   // These should be replaced by UDVIREM, but it does not happen automatically
 162   // during Type Legalization
 163   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 164   setOperationAction(ISD::UREM, MVT::i64, Custom);
 165   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 166   setOperationAction(ISD::SREM, MVT::i64, Custom);
 167
 168   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 169   //  to be Legal/Custom in order to avoid library calls.
 170   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 171   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 172   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 173
 174   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 175
 176   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 177   for (MVT VT : ScalarIntVTs) {
 178     setOperationAction(ISD::ADDC, VT, Expand);
 179     setOperationAction(ISD::SUBC, VT, Expand);
 180     setOperationAction(ISD::ADDE, VT, Expand);
 181     setOperationAction(ISD::SUBE, VT, Expand);
 182   }
 183
 184   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 185   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 186   setSchedulingPreference(Sched::Source);
 187 }
 188
 189 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 190     MachineInstr * MI, MachineBasicBlock * BB) const {
 191   MachineFunction * MF = BB->getParent();
 192   MachineRegisterInfo &MRI = MF->getRegInfo();
 193   MachineBasicBlock::iterator I = *MI;
 194   const R600InstrInfo *TII =
 195       static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
 196
 197   switch (MI->getOpcode()) {
 198   default:
 199     // Replace LDS_*_RET instruction that don't have any uses with the
 200     // equivalent LDS_*_NORET instruction.
 201     if (TII->isLDSRetInstr(MI->getOpcode())) {
 202       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 203       assert(DstIdx != -1);
 204       MachineInstrBuilder NewMI;
 205       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 206       //        LDS_1A2D support and remove this special case.
 207       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 208            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 209         return BB;
 210
 211       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 212                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 213       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 214         NewMI.addOperand(MI->getOperand(i));
 215       }
 216     } else {
 217       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 218     }
 219     break;
 220   case AMDGPU::CLAMP_R600: {
 221     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 222                                                    AMDGPU::MOV,
 223                                                    MI->getOperand(0).getReg(),
 224                                                    MI->getOperand(1).getReg());
 225     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 226     break;
 227   }
 228
 229   case AMDGPU::FABS_R600: {
 230     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 231                                                     AMDGPU::MOV,
 232                                                     MI->getOperand(0).getReg(),
 233                                                     MI->getOperand(1).getReg());
 234     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 235     break;
 236   }
 237
 238   case AMDGPU::FNEG_R600: {
 239     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 240                                                     AMDGPU::MOV,
 241                                                     MI->getOperand(0).getReg(),
 242                                                     MI->getOperand(1).getReg());
 243     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 244     break;
 245   }
 246
 247   case AMDGPU::MASK_WRITE: {
 248     unsigned maskedRegister = MI->getOperand(0).getReg();
 249     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 250     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 251     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 252     break;
 253   }
 254
 255   case AMDGPU::MOV_IMM_F32:
 256     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 257                      MI->getOperand(1).getFPImm()->getValueAPF()
 258                          .bitcastToAPInt().getZExtValue());
 259     break;
 260   case AMDGPU::MOV_IMM_I32:
 261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 262                      MI->getOperand(1).getImm());
 263     break;
 264   case AMDGPU::CONST_COPY: {
 265     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 266         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 267     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 268         MI->getOperand(1).getImm());
 269     break;
 270   }
 271
 272   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 273   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 274   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 275     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 276
 277     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 278             .addOperand(MI->getOperand(0))
 279             .addOperand(MI->getOperand(1))
 280             .addImm(EOP); // Set End of program bit
 281     break;
 282   }
 283
 284   case AMDGPU::TXD: {
 285     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 286     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 287     MachineOperand &RID = MI->getOperand(4);
 288     MachineOperand &SID = MI->getOperand(5);
 289     unsigned TextureId = MI->getOperand(6).getImm();
 290     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 291     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 292
 293     switch (TextureId) {
 294     case 5: // Rect
 295       CTX = CTY = 0;
 296       break;
 297     case 6: // Shadow1D
 298       SrcW = SrcZ;
 299       break;
 300     case 7: // Shadow2D
 301       SrcW = SrcZ;
 302       break;
 303     case 8: // ShadowRect
 304       CTX = CTY = 0;
 305       SrcW = SrcZ;
 306       break;
 307     case 9: // 1DArray
 308       SrcZ = SrcY;
 309       CTZ = 0;
 310       break;
 311     case 10: // 2DArray
 312       CTZ = 0;
 313       break;
 314     case 11: // Shadow1DArray
 315       SrcZ = SrcY;
 316       CTZ = 0;
 317       break;
 318     case 12: // Shadow2DArray
 319       CTZ = 0;
 320       break;
 321     }
 322     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 323             .addOperand(MI->getOperand(3))
 324             .addImm(SrcX)
 325             .addImm(SrcY)
 326             .addImm(SrcZ)
 327             .addImm(SrcW)
 328             .addImm(0)
 329             .addImm(0)
 330             .addImm(0)
 331             .addImm(0)
 332             .addImm(1)
 333             .addImm(2)
 334             .addImm(3)
 335             .addOperand(RID)
 336             .addOperand(SID)
 337             .addImm(CTX)
 338             .addImm(CTY)
 339             .addImm(CTZ)
 340             .addImm(CTW);
 341     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 342             .addOperand(MI->getOperand(2))
 343             .addImm(SrcX)
 344             .addImm(SrcY)
 345             .addImm(SrcZ)
 346             .addImm(SrcW)
 347             .addImm(0)
 348             .addImm(0)
 349             .addImm(0)
 350             .addImm(0)
 351             .addImm(1)
 352             .addImm(2)
 353             .addImm(3)
 354             .addOperand(RID)
 355             .addOperand(SID)
 356             .addImm(CTX)
 357             .addImm(CTY)
 358             .addImm(CTZ)
 359             .addImm(CTW);
 360     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 361             .addOperand(MI->getOperand(0))
 362             .addOperand(MI->getOperand(1))
 363             .addImm(SrcX)
 364             .addImm(SrcY)
 365             .addImm(SrcZ)
 366             .addImm(SrcW)
 367             .addImm(0)
 368             .addImm(0)
 369             .addImm(0)
 370             .addImm(0)
 371             .addImm(1)
 372             .addImm(2)
 373             .addImm(3)
 374             .addOperand(RID)
 375             .addOperand(SID)
 376             .addImm(CTX)
 377             .addImm(CTY)
 378             .addImm(CTZ)
 379             .addImm(CTW)
 380             .addReg(T0, RegState::Implicit)
 381             .addReg(T1, RegState::Implicit);
 382     break;
 383   }
 384
 385   case AMDGPU::TXD_SHADOW: {
 386     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 387     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 388     MachineOperand &RID = MI->getOperand(4);
 389     MachineOperand &SID = MI->getOperand(5);
 390     unsigned TextureId = MI->getOperand(6).getImm();
 391     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 392     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 393
 394     switch (TextureId) {
 395     case 5: // Rect
 396       CTX = CTY = 0;
 397       break;
 398     case 6: // Shadow1D
 399       SrcW = SrcZ;
 400       break;
 401     case 7: // Shadow2D
 402       SrcW = SrcZ;
 403       break;
 404     case 8: // ShadowRect
 405       CTX = CTY = 0;
 406       SrcW = SrcZ;
 407       break;
 408     case 9: // 1DArray
 409       SrcZ = SrcY;
 410       CTZ = 0;
 411       break;
 412     case 10: // 2DArray
 413       CTZ = 0;
 414       break;
 415     case 11: // Shadow1DArray
 416       SrcZ = SrcY;
 417       CTZ = 0;
 418       break;
 419     case 12: // Shadow2DArray
 420       CTZ = 0;
 421       break;
 422     }
 423
 424     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 425             .addOperand(MI->getOperand(3))
 426             .addImm(SrcX)
 427             .addImm(SrcY)
 428             .addImm(SrcZ)
 429             .addImm(SrcW)
 430             .addImm(0)
 431             .addImm(0)
 432             .addImm(0)
 433             .addImm(0)
 434             .addImm(1)
 435             .addImm(2)
 436             .addImm(3)
 437             .addOperand(RID)
 438             .addOperand(SID)
 439             .addImm(CTX)
 440             .addImm(CTY)
 441             .addImm(CTZ)
 442             .addImm(CTW);
 443     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 444             .addOperand(MI->getOperand(2))
 445             .addImm(SrcX)
 446             .addImm(SrcY)
 447             .addImm(SrcZ)
 448             .addImm(SrcW)
 449             .addImm(0)
 450             .addImm(0)
 451             .addImm(0)
 452             .addImm(0)
 453             .addImm(1)
 454             .addImm(2)
 455             .addImm(3)
 456             .addOperand(RID)
 457             .addOperand(SID)
 458             .addImm(CTX)
 459             .addImm(CTY)
 460             .addImm(CTZ)
 461             .addImm(CTW);
 462     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 463             .addOperand(MI->getOperand(0))
 464             .addOperand(MI->getOperand(1))
 465             .addImm(SrcX)
 466             .addImm(SrcY)
 467             .addImm(SrcZ)
 468             .addImm(SrcW)
 469             .addImm(0)
 470             .addImm(0)
 471             .addImm(0)
 472             .addImm(0)
 473             .addImm(1)
 474             .addImm(2)
 475             .addImm(3)
 476             .addOperand(RID)
 477             .addOperand(SID)
 478             .addImm(CTX)
 479             .addImm(CTY)
 480             .addImm(CTZ)
 481             .addImm(CTW)
 482             .addReg(T0, RegState::Implicit)
 483             .addReg(T1, RegState::Implicit);
 484     break;
 485   }
 486
 487   case AMDGPU::BRANCH:
 488       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 489               .addOperand(MI->getOperand(0));
 490       break;
 491
 492   case AMDGPU::BRANCH_COND_f32: {
 493     MachineInstr *NewMI =
 494       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 495               AMDGPU::PREDICATE_BIT)
 496               .addOperand(MI->getOperand(1))
 497               .addImm(OPCODE_IS_NOT_ZERO)
 498               .addImm(0); // Flags
 499     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 500     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 501             .addOperand(MI->getOperand(0))
 502             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 503     break;
 504   }
 505
 506   case AMDGPU::BRANCH_COND_i32: {
 507     MachineInstr *NewMI =
 508       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 509             AMDGPU::PREDICATE_BIT)
 510             .addOperand(MI->getOperand(1))
 511             .addImm(OPCODE_IS_NOT_ZERO_INT)
 512             .addImm(0); // Flags
 513     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 514     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 515            .addOperand(MI->getOperand(0))
 516             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 517     break;
 518   }
 519
 520   case AMDGPU::EG_ExportSwz:
 521   case AMDGPU::R600_ExportSwz: {
 522     // Instruction is left unmodified if its not the last one of its type
 523     bool isLastInstructionOfItsType = true;
 524     unsigned InstExportType = MI->getOperand(1).getImm();
 525     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 526          EndBlock = BB->end(); NextExportInst != EndBlock;
 527          NextExportInst = std::next(NextExportInst)) {
 528       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 529           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 530         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 531             .getImm();
 532         if (CurrentInstExportType == InstExportType) {
 533           isLastInstructionOfItsType = false;
 534           break;
 535         }
 536       }
 537     }
 538     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 539     if (!EOP && !isLastInstructionOfItsType)
 540       return BB;
 541     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 542     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 543             .addOperand(MI->getOperand(0))
 544             .addOperand(MI->getOperand(1))
 545             .addOperand(MI->getOperand(2))
 546             .addOperand(MI->getOperand(3))
 547             .addOperand(MI->getOperand(4))
 548             .addOperand(MI->getOperand(5))
 549             .addOperand(MI->getOperand(6))
 550             .addImm(CfInst)
 551             .addImm(EOP);
 552     break;
 553   }
 554   case AMDGPU::RETURN: {
 555     // RETURN instructions must have the live-out registers as implicit uses,
 556     // otherwise they appear dead.
 557     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 558     MachineInstrBuilder MIB(*MF, MI);
 559     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 560       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 561     return BB;
 562   }
 563   }
 564
 565   MI->eraseFromParent();
 566   return BB;
 567 }
 568
 569 //===----------------------------------------------------------------------===//
 570 // Custom DAG Lowering Operations
 571 //===----------------------------------------------------------------------===//
 572
 573 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 574   MachineFunction &MF = DAG.getMachineFunction();
 575   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 576   switch (Op.getOpcode()) {
 577   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 578   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 579   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 580   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 581   case ISD::SRA_PARTS:
 582   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 583   case ISD::FCOS:
 584   case ISD::FSIN: return LowerTrig(Op, DAG);
 585   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 586   case ISD::STORE: return LowerSTORE(Op, DAG);
 587   case ISD::LOAD: {
 588     SDValue Result = LowerLOAD(Op, DAG);
 589     assert((!Result.getNode() ||
 590             Result.getNode()->getNumValues() == 2) &&
 591            "Load should return a value and a chain");
 592     return Result;
 593   }
 594
 595   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 596   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 597   case ISD::INTRINSIC_VOID: {
 598     SDValue Chain = Op.getOperand(0);
 599     unsigned IntrinsicID =
 600                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 601     switch (IntrinsicID) {
 602     case AMDGPUIntrinsic::AMDGPU_store_output: {
 603       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 604       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 605       MFI->LiveOuts.push_back(Reg);
 606       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 607     }
 608     case AMDGPUIntrinsic::R600_store_swizzle: {
 609       const SDValue Args[8] = {
 610         Chain,
 611         Op.getOperand(2), // Export Value
 612         Op.getOperand(3), // ArrayBase
 613         Op.getOperand(4), // Type
 614         DAG.getConstant(0, MVT::i32), // SWZ_X
 615         DAG.getConstant(1, MVT::i32), // SWZ_Y
 616         DAG.getConstant(2, MVT::i32), // SWZ_Z
 617         DAG.getConstant(3, MVT::i32) // SWZ_W
 618       };
 619       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 620     }
 621
 622     // default for switch(IntrinsicID)
 623     default: break;
 624     }
 625     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 626     break;
 627   }
 628   case ISD::INTRINSIC_WO_CHAIN: {
 629     unsigned IntrinsicID =
 630                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 631     EVT VT = Op.getValueType();
 632     SDLoc DL(Op);
 633     switch(IntrinsicID) {
 634     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 635     case AMDGPUIntrinsic::R600_load_input: {
 636       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 637       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 638       MachineFunction &MF = DAG.getMachineFunction();
 639       MachineRegisterInfo &MRI = MF.getRegInfo();
 640       MRI.addLiveIn(Reg);
 641       return DAG.getCopyFromReg(DAG.getEntryNode(),
 642           SDLoc(DAG.getEntryNode()), Reg, VT);
 643     }
 644
 645     case AMDGPUIntrinsic::R600_interp_input: {
 646       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 647       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 648       MachineSDNode *interp;
 649       if (ijb < 0) {
 650         const MachineFunction &MF = DAG.getMachineFunction();
 651         const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
 652             MF.getSubtarget().getInstrInfo());
 653         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 654             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 655         return DAG.getTargetExtractSubreg(
 656             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 657             DL, MVT::f32, SDValue(interp, 0));
 658       }
 659       MachineFunction &MF = DAG.getMachineFunction();
 660       MachineRegisterInfo &MRI = MF.getRegInfo();
 661       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 662       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 663       MRI.addLiveIn(RegisterI);
 664       MRI.addLiveIn(RegisterJ);
 665       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 666           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 667       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 668           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 669
 670       if (slot % 4 < 2)
 671         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 672             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 673             RegisterJNode, RegisterINode);
 674       else
 675         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 676             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 677             RegisterJNode, RegisterINode);
 678       return SDValue(interp, slot % 2);
 679     }
 680     case AMDGPUIntrinsic::R600_interp_xy:
 681     case AMDGPUIntrinsic::R600_interp_zw: {
 682       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 683       MachineSDNode *interp;
 684       SDValue RegisterINode = Op.getOperand(2);
 685       SDValue RegisterJNode = Op.getOperand(3);
 686
 687       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 688         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 689             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 690             RegisterJNode, RegisterINode);
 691       else
 692         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 693             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 694             RegisterJNode, RegisterINode);
 695       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 696           SDValue(interp, 0), SDValue(interp, 1));
 697     }
 698     case AMDGPUIntrinsic::R600_tex:
 699     case AMDGPUIntrinsic::R600_texc:
 700     case AMDGPUIntrinsic::R600_txl:
 701     case AMDGPUIntrinsic::R600_txlc:
 702     case AMDGPUIntrinsic::R600_txb:
 703     case AMDGPUIntrinsic::R600_txbc:
 704     case AMDGPUIntrinsic::R600_txf:
 705     case AMDGPUIntrinsic::R600_txq:
 706     case AMDGPUIntrinsic::R600_ddx:
 707     case AMDGPUIntrinsic::R600_ddy:
 708     case AMDGPUIntrinsic::R600_ldptr: {
 709       unsigned TextureOp;
 710       switch (IntrinsicID) {
 711       case AMDGPUIntrinsic::R600_tex:
 712         TextureOp = 0;
 713         break;
 714       case AMDGPUIntrinsic::R600_texc:
 715         TextureOp = 1;
 716         break;
 717       case AMDGPUIntrinsic::R600_txl:
 718         TextureOp = 2;
 719         break;
 720       case AMDGPUIntrinsic::R600_txlc:
 721         TextureOp = 3;
 722         break;
 723       case AMDGPUIntrinsic::R600_txb:
 724         TextureOp = 4;
 725         break;
 726       case AMDGPUIntrinsic::R600_txbc:
 727         TextureOp = 5;
 728         break;
 729       case AMDGPUIntrinsic::R600_txf:
 730         TextureOp = 6;
 731         break;
 732       case AMDGPUIntrinsic::R600_txq:
 733         TextureOp = 7;
 734         break;
 735       case AMDGPUIntrinsic::R600_ddx:
 736         TextureOp = 8;
 737         break;
 738       case AMDGPUIntrinsic::R600_ddy:
 739         TextureOp = 9;
 740         break;
 741       case AMDGPUIntrinsic::R600_ldptr:
 742         TextureOp = 10;
 743         break;
 744       default:
 745         llvm_unreachable("Unknow Texture Operation");
 746       }
 747
 748       SDValue TexArgs[19] = {
 749         DAG.getConstant(TextureOp, MVT::i32),
 750         Op.getOperand(1),
 751         DAG.getConstant(0, MVT::i32),
 752         DAG.getConstant(1, MVT::i32),
 753         DAG.getConstant(2, MVT::i32),
 754         DAG.getConstant(3, MVT::i32),
 755         Op.getOperand(2),
 756         Op.getOperand(3),
 757         Op.getOperand(4),
 758         DAG.getConstant(0, MVT::i32),
 759         DAG.getConstant(1, MVT::i32),
 760         DAG.getConstant(2, MVT::i32),
 761         DAG.getConstant(3, MVT::i32),
 762         Op.getOperand(5),
 763         Op.getOperand(6),
 764         Op.getOperand(7),
 765         Op.getOperand(8),
 766         Op.getOperand(9),
 767         Op.getOperand(10)
 768       };
 769       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 770     }
 771     case AMDGPUIntrinsic::AMDGPU_dp4: {
 772       SDValue Args[8] = {
 773       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 774           DAG.getConstant(0, MVT::i32)),
 775       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 776           DAG.getConstant(0, MVT::i32)),
 777       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 778           DAG.getConstant(1, MVT::i32)),
 779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 780           DAG.getConstant(1, MVT::i32)),
 781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 782           DAG.getConstant(2, MVT::i32)),
 783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 784           DAG.getConstant(2, MVT::i32)),
 785       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 786           DAG.getConstant(3, MVT::i32)),
 787       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 788           DAG.getConstant(3, MVT::i32))
 789       };
 790       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 791     }
 792
 793     case Intrinsic::r600_read_ngroups_x:
 794       return LowerImplicitParameter(DAG, VT, DL, 0);
 795     case Intrinsic::r600_read_ngroups_y:
 796       return LowerImplicitParameter(DAG, VT, DL, 1);
 797     case Intrinsic::r600_read_ngroups_z:
 798       return LowerImplicitParameter(DAG, VT, DL, 2);
 799     case Intrinsic::r600_read_global_size_x:
 800       return LowerImplicitParameter(DAG, VT, DL, 3);
 801     case Intrinsic::r600_read_global_size_y:
 802       return LowerImplicitParameter(DAG, VT, DL, 4);
 803     case Intrinsic::r600_read_global_size_z:
 804       return LowerImplicitParameter(DAG, VT, DL, 5);
 805     case Intrinsic::r600_read_local_size_x:
 806       return LowerImplicitParameter(DAG, VT, DL, 6);
 807     case Intrinsic::r600_read_local_size_y:
 808       return LowerImplicitParameter(DAG, VT, DL, 7);
 809     case Intrinsic::r600_read_local_size_z:
 810       return LowerImplicitParameter(DAG, VT, DL, 8);
 811
 812     case Intrinsic::r600_read_tgid_x:
 813       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 814                                   AMDGPU::T1_X, VT);
 815     case Intrinsic::r600_read_tgid_y:
 816       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 817                                   AMDGPU::T1_Y, VT);
 818     case Intrinsic::r600_read_tgid_z:
 819       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 820                                   AMDGPU::T1_Z, VT);
 821     case Intrinsic::r600_read_tidig_x:
 822       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 823                                   AMDGPU::T0_X, VT);
 824     case Intrinsic::r600_read_tidig_y:
 825       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 826                                   AMDGPU::T0_Y, VT);
 827     case Intrinsic::r600_read_tidig_z:
 828       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 829                                   AMDGPU::T0_Z, VT);
 830     case Intrinsic::AMDGPU_rsq:
 831       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 832       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 833     }
 834     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 835     break;
 836   }
 837   } // end switch(Op.getOpcode())
 838   return SDValue();
 839 }
 840
 841 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 842                                             SmallVectorImpl<SDValue> &Results,
 843                                             SelectionDAG &DAG) const {
 844   switch (N->getOpcode()) {
 845   default:
 846     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 847     return;
 848   case ISD::FP_TO_UINT:
 849     if (N->getValueType(0) == MVT::i1) {
 850       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 851       return;
 852     }
 853     // Fall-through. Since we don't care about out of bounds values
 854     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 855     // considers some extra cases which are not necessary here.
 856   case ISD::FP_TO_SINT: {
 857     SDValue Result;
 858     if (expandFP_TO_SINT(N, Result, DAG))
 859       Results.push_back(Result);
 860     return;
 861   }
 862   case ISD::UDIV: {
 863     SDValue Op = SDValue(N, 0);
 864     SDLoc DL(Op);
 865     EVT VT = Op.getValueType();
 866     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 867       N->getOperand(0), N->getOperand(1));
 868     Results.push_back(UDIVREM);
 869     break;
 870   }
 871   case ISD::UREM: {
 872     SDValue Op = SDValue(N, 0);
 873     SDLoc DL(Op);
 874     EVT VT = Op.getValueType();
 875     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 876       N->getOperand(0), N->getOperand(1));
 877     Results.push_back(UDIVREM.getValue(1));
 878     break;
 879   }
 880   case ISD::SDIV: {
 881     SDValue Op = SDValue(N, 0);
 882     SDLoc DL(Op);
 883     EVT VT = Op.getValueType();
 884     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 885       N->getOperand(0), N->getOperand(1));
 886     Results.push_back(SDIVREM);
 887     break;
 888   }
 889   case ISD::SREM: {
 890     SDValue Op = SDValue(N, 0);
 891     SDLoc DL(Op);
 892     EVT VT = Op.getValueType();
 893     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 894       N->getOperand(0), N->getOperand(1));
 895     Results.push_back(SDIVREM.getValue(1));
 896     break;
 897   }
 898   case ISD::SDIVREM: {
 899     SDValue Op = SDValue(N, 1);
 900     SDValue RES = LowerSDIVREM(Op, DAG);
 901     Results.push_back(RES);
 902     Results.push_back(RES.getValue(1));
 903     break;
 904   }
 905   case ISD::UDIVREM: {
 906     SDValue Op = SDValue(N, 0);
 907     SDLoc DL(Op);
 908     EVT VT = Op.getValueType();
 909     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
 910
 911     SDValue one = DAG.getConstant(1, HalfVT);
 912     SDValue zero = DAG.getConstant(0, HalfVT);
 913
 914     //HiLo split
 915     SDValue LHS = N->getOperand(0);
 916     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
 917     SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
 918
 919     SDValue RHS = N->getOperand(1);
 920     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
 921     SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
 922
 923     // Get Speculative values
 924     SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
 925     SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
 926
 927     SDValue REM_Hi = zero;
 928     SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
 929
 930     SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
 931     SDValue DIV_Lo = zero;
 932
 933     const unsigned halfBitWidth = HalfVT.getSizeInBits();
 934
 935     for (unsigned i = 0; i < halfBitWidth; ++i) {
 936       SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
 937       // Get Value of high bit
 938       SDValue HBit;
 939       if (halfBitWidth == 32 && Subtarget->hasBFE()) {
 940         HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
 941       } else {
 942         HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
 943         HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
 944       }
 945
 946       SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
 947         DAG.getConstant(halfBitWidth - 1, HalfVT));
 948       REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
 949       REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
 950
 951       REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
 952       REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
 953
 954
 955       SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 956
 957       SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
 958       SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
 959
 960       DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
 961
 962       // Update REM
 963
 964       SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
 965
 966       REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
 967       REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
 968       REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
 969     }
 970
 971     SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
 972     SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
 973     Results.push_back(DIV);
 974     Results.push_back(REM);
 975     break;
 976   }
 977   }
 978 }
 979
 980 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 981                                                    SDValue Vector) const {
 982
 983   SDLoc DL(Vector);
 984   EVT VecVT = Vector.getValueType();
 985   EVT EltVT = VecVT.getVectorElementType();
 986   SmallVector<SDValue, 8> Args;
 987
 988   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 989                                                            i != e; ++i) {
 990     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 991                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 992   }
 993
 994   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 995 }
 996
 997 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 998                                                     SelectionDAG &DAG) const {
 999
1000   SDLoc DL(Op);
1001   SDValue Vector = Op.getOperand(0);
1002   SDValue Index = Op.getOperand(1);
1003
1004   if (isa<ConstantSDNode>(Index) ||
1005       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1006     return Op;
1007
1008   Vector = vectorToVerticalVector(DAG, Vector);
1009   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
1010                      Vector, Index);
1011 }
1012
1013 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
1014                                                    SelectionDAG &DAG) const {
1015   SDLoc DL(Op);
1016   SDValue Vector = Op.getOperand(0);
1017   SDValue Value = Op.getOperand(1);
1018   SDValue Index = Op.getOperand(2);
1019
1020   if (isa<ConstantSDNode>(Index) ||
1021       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
1022     return Op;
1023
1024   Vector = vectorToVerticalVector(DAG, Vector);
1025   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
1026                                Vector, Value, Index);
1027   return vectorToVerticalVector(DAG, Insert);
1028 }
1029
1030 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
1031   // On hw >= R700, COS/SIN input must be between -1. and 1.
1032   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
1033   EVT VT = Op.getValueType();
1034   SDValue Arg = Op.getOperand(0);
1035   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
1036       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
1037         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
1038           DAG.getConstantFP(0.15915494309, MVT::f32)),
1039         DAG.getConstantFP(0.5, MVT::f32)));
1040   unsigned TrigNode;
1041   switch (Op.getOpcode()) {
1042   case ISD::FCOS:
1043     TrigNode = AMDGPUISD::COS_HW;
1044     break;
1045   case ISD::FSIN:
1046     TrigNode = AMDGPUISD::SIN_HW;
1047     break;
1048   default:
1049     llvm_unreachable("Wrong trig opcode");
1050   }
1051   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
1052       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
1053         DAG.getConstantFP(-0.5, MVT::f32)));
1054   if (Gen >= AMDGPUSubtarget::R700)
1055     return TrigVal;
1056   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1057   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1058       DAG.getConstantFP(3.14159265359, MVT::f32));
1059 }
1060
1061 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1062   SDLoc DL(Op);
1063   EVT VT = Op.getValueType();
1064
1065   SDValue Lo = Op.getOperand(0);
1066   SDValue Hi = Op.getOperand(1);
1067   SDValue Shift = Op.getOperand(2);
1068   SDValue Zero = DAG.getConstant(0, VT);
1069   SDValue One  = DAG.getConstant(1, VT);
1070
1071   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1072   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1073   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1074   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1075
1076   // The dance around Width1 is necessary for 0 special case.
1077   // Without it the CompShift might be 32, producing incorrect results in
1078   // Overflow. So we do the shift in two steps, the alternative is to
1079   // add a conditional to filter the special case.
1080
1081   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1082   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1083
1084   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1085   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1086   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1087
1088   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1089   SDValue LoBig = Zero;
1090
1091   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1092   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1093
1094   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1095 }
1096
1097 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1098   SDLoc DL(Op);
1099   EVT VT = Op.getValueType();
1100
1101   SDValue Lo = Op.getOperand(0);
1102   SDValue Hi = Op.getOperand(1);
1103   SDValue Shift = Op.getOperand(2);
1104   SDValue Zero = DAG.getConstant(0, VT);
1105   SDValue One  = DAG.getConstant(1, VT);
1106
1107   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1108
1109   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1110   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1111   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1112   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1113
1114   // The dance around Width1 is necessary for 0 special case.
1115   // Without it the CompShift might be 32, producing incorrect results in
1116   // Overflow. So we do the shift in two steps, the alternative is to
1117   // add a conditional to filter the special case.
1118
1119   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1120   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1121
1122   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1123   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1124   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1125
1126   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1127   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1128
1129   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1130   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1131
1132   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1133 }
1134
     /// Build the i1 value used for an FP_TO_UINT with an i1 result: a SETCC
     /// that compares \p Op against the f32 constant 0.0 with SETNE.
1135 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1136   SDLoc Loc(Op);
1137   SDValue FPZero = DAG.getConstantFP(0.0f, MVT::f32);
1138   SDValue NotEqual = DAG.getCondCode(ISD::SETNE);
1139
1140   // The comparison operand is \p Op itself; callers pass the floating-point
1141   // source value of the conversion.
1142   return DAG.getNode(ISD::SETCC, Loc, MVT::i1, Op, FPZero, NotEqual);
1143 }
1144
1145 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1146                                                    SDLoc DL,
1147                                                    unsigned DwordOffset) const {
1148   unsigned ByteOffset = DwordOffset * 4;
1149   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1150                                       AMDGPUAS::CONSTANT_BUFFER_0);
1151
1152   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1153   assert(isInt<16>(ByteOffset));
1154
1155   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1156                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1157                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1158                      false, false, false, 0);
1159 }
1160
     /// Returns true when \p Op is a ConstantSDNode holding the null value or a
     /// ConstantFPSDNode holding zero; false for every other node.
1161 bool R600TargetLowering::isZero(SDValue Op) const {
1162   if (ConstantSDNode *IntCst = dyn_cast<ConstantSDNode>(Op))
1163     return IntCst->isNullValue();
1164
1165   if (ConstantFPSDNode *FPCst = dyn_cast<ConstantFPSDNode>(Op))
1166     return FPCst->isZero();
1167
1168   return false;
1169 }
1170
     /// Custom lowering of SELECT_CC (operands: LHS, RHS, True, False, CC).
     ///
     /// Three strategies are tried in order:
     ///  1. Emit a SELECT_CC whose True/False operands are the hardware
     ///     true/false values, so it can be matched by a SET* instruction.
     ///  2. Emit a SELECT_CC that compares against zero, so it can be matched by
     ///     a CND* instruction; True/False are bitcast to the compare type when
     ///     it differs from the result type.
     ///  3. Otherwise split into two SELECT_CC nodes: one producing the hardware
     ///     true/false value, and one selecting True/False on "!= false".
     ///
     /// Only f32 and i32 compare types are handled by the fallback; any other
     /// type reaches llvm_unreachable.
1171 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1172   SDLoc DL(Op);
1173   EVT VT = Op.getValueType();
1174
1175   SDValue LHS = Op.getOperand(0);
1176   SDValue RHS = Op.getOperand(1);
1177   SDValue True = Op.getOperand(2);
1178   SDValue False = Op.getOperand(3);
1179   SDValue CC = Op.getOperand(4);
1180   SDValue Temp;
1181
1182   // LHS and RHS are guaranteed to be the same value type
1183   EVT CompareVT = LHS.getValueType();
1184
1185   // Check if we can lower this to a native operation.
1186
1187   // Try to lower to a SET* instruction:
1188   //
1189   // SET* can match the following patterns:
1190   //
1191   // select_cc f32, f32, -1,  0, cc_supported
1192   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1193   // select_cc i32, i32, -1,  0, cc_supported
1194   //
1195
1196   // Move hardware True/False values to the correct operand.
       // NOTE(review): the "is integer" flag passed to getSetCCInverse is
       // `CompareVT == MVT::i32` here and further down, but
       // `CompareVT.isInteger()` in the zero-on-LHS path - confirm the two are
       // meant to differ.
1197   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1198   ISD::CondCode InverseCC =
1199      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
       // The operands are the wrong way round for SET*: flip them by inverting
       // the condition, or by inverting it and swapping LHS/RHS, whichever
       // yields a legal condition code.  If neither is legal nothing changes.
1200   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1201     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1202       std::swap(False, True);
1203       CC = DAG.getCondCode(InverseCC);
1204     } else {
1205       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1206       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1207         std::swap(False, True);
1208         std::swap(LHS, RHS);
1209         CC = DAG.getCondCode(SwapInvCC);
1210       }
1211     }
1212   }
1213
1214   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1215       (CompareVT == VT || VT == MVT::i32)) {
1216     // This can be matched by a SET* instruction.
1217     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1218   }
1219
1220   // Try to lower to a CND* instruction:
1221   //
1222   // CND* can match the following patterns:
1223   //
1224   // select_cc f32, 0.0, f32, f32, cc_supported
1225   // select_cc f32, 0.0, i32, i32, cc_supported
1226   // select_cc i32, 0,   f32, f32, cc_supported
1227   // select_cc i32, 0,   i32, i32, cc_supported
1228   //
1229
1230   // Try to move the zero value to the RHS
1231   if (isZero(LHS)) {
         // Re-read the condition code: CC may have been replaced above.
1232     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1233     // Try swapping the operands
1234     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1235     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1236       std::swap(LHS, RHS);
1237       CC = DAG.getCondCode(CCSwapped);
1238     } else {
1239       // Try inverting the condition and then swapping the operands
1240       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1241       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1242       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1243         std::swap(True, False);
1244         std::swap(LHS, RHS);
1245         CC = DAG.getCondCode(CCSwapped);
1246       }
1247     }
1248   }
1249   if (isZero(RHS)) {
1250     SDValue Cond = LHS;
1251     SDValue Zero = RHS;
1252     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1253     if (CompareVT != VT) {
1254       // Bitcast True / False to the correct types.  This will end up being
1255       // a nop, but it allows us to define only a single pattern in the
1256       // .TD files for each CND* instruction rather than having to have
1257       // one pattern for integer True/False and one for fp True/False
1258       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1259       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1260     }
1261
         // The not-equal conditions are rewritten as their inverse with
         // True/False exchanged.
1262     switch (CCOpcode) {
1263     case ISD::SETONE:
1264     case ISD::SETUNE:
1265     case ISD::SETNE:
1266       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1267       Temp = True;
1268       True = False;
1269       False = Temp;
1270       break;
1271     default:
1272       break;
1273     }
1274     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1275         Cond, Zero,
1276         True, False,
1277         DAG.getCondCode(CCOpcode));
         // The select was built in CompareVT; cast back to the result type.
1278     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1279   }
1280
1281   // If we make it this far it means we have no native instructions to handle
1282   // this SELECT_CC, so we must lower it.
1283   SDValue HWTrue, HWFalse;
1284
1285   if (CompareVT == MVT::f32) {
1286     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1287     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1288   } else if (CompareVT == MVT::i32) {
1289     HWTrue = DAG.getConstant(-1, CompareVT);
1290     HWFalse = DAG.getConstant(0, CompareVT);
1291   }
1292   else {
1293     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1294   }
1295
1296   // Lower this unsupported SELECT_CC into a combination of two supported
1297   // SELECT_CC operations.
1298   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1299
1300   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1301       Cond, HWFalse,
1302       True, False,
1303       DAG.getCondCode(ISD::SETNE));
1304 }
1305
1306 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1307 /// convert these pointers to a register index.  Each register holds
1308 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1309 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1310 /// for indirect addressing.
1311 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1312                                                unsigned StackWidth,
1313                                                SelectionDAG &DAG) const {
1314   unsigned SRLPad;
1315   switch(StackWidth) {
1316   case 1:
1317     SRLPad = 2;
1318     break;
1319   case 2:
1320     SRLPad = 3;
1321     break;
1322   case 4:
1323     SRLPad = 4;
1324     break;
1325   default: llvm_unreachable("Invalid stack width");
1326   }
1327
1328   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1329                      DAG.getConstant(SRLPad, MVT::i32));
1330 }
1331
1332 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1333                                          unsigned ElemIdx,
1334                                          unsigned &Channel,
1335                                          unsigned &PtrIncr) const {
1336   switch (StackWidth) {
1337   default:
1338   case 1:
1339     Channel = 0;
1340     if (ElemIdx > 0) {
1341       PtrIncr = 1;
1342     } else {
1343       PtrIncr = 0;
1344     }
1345     break;
1346   case 2:
1347     Channel = ElemIdx % 2;
1348     if (ElemIdx == 2) {
1349       PtrIncr = 1;
1350     } else {
1351       PtrIncr = 0;
1352     }
1353     break;
1354   case 4:
1355     Channel = ElemIdx;
1356     PtrIncr = 0;
1357     break;
1358   }
1359 }
1360
     /// Custom lowering of STORE (operands: Chain, Value, Ptr).
     ///
     /// The generic AMDGPUTargetLowering::LowerSTORE is tried first.  After that:
     ///  - global truncating stores become an AMDGPUISD::STORE_MSKOR memory
     ///    intrinsic taking a v4i32 (shifted value, 0, 0, shifted mask) and a
     ///    dword address;
     ///  - other global stores of at least 32 bits whose pointer is not already
     ///    a DWORDADDR are re-emitted with the pointer wrapped in
     ///    DWORDADDR(Ptr >> 2);
     ///  - stores to any address space other than private return an empty
     ///    SDValue;
     ///  - private stores become AMDGPUISD::REGISTER_STORE nodes, one per vector
     ///    element (joined by a TokenFactor) or a single one for scalars.
     ///
     /// \returns the lowered node or chain, or an empty SDValue when no custom
     ///          lowering is done here.
1361 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1362   SDLoc DL(Op);
1363   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1364   SDValue Chain = Op.getOperand(0);
1365   SDValue Value = Op.getOperand(1);
1366   SDValue Ptr = Op.getOperand(2);
1367
1368   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1369   if (Result.getNode()) {
1370     return Result;
1371   }
1372
1373   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1374     if (StoreNode->isTruncatingStore()) {
1375       EVT VT = Value.getValueType();
1376       assert(VT.bitsLE(MVT::i32));
1377       EVT MemVT = StoreNode->getMemoryVT();
1378       SDValue MaskConstant;
1379       if (MemVT == MVT::i8) {
1380         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1381       } else {
1382         assert(MemVT == MVT::i16);
1383         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1384       }
           // NOTE(review): DWordAddr is built with the value type VT while
           // ByteIndex uses Ptr.getValueType(); confirm the two types always
           // agree on this path.
1385       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1386                                       DAG.getConstant(2, MVT::i32));
1387       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1388                                       DAG.getConstant(0x00000003, VT));
1389       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
           // Shift = ByteIndex * 8: bit position of the addressed byte within
           // its dword.  Value and mask are both moved to that position.
1390       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1391                                    DAG.getConstant(3, VT));
1392       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1393       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1394       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1395       // vector instead.
1396       SDValue Src[4] = {
1397         ShiftedValue,
1398         DAG.getConstant(0, MVT::i32),
1399         DAG.getConstant(0, MVT::i32),
1400         Mask
1401       };
1402       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1403       SDValue Args[3] = { Chain, Input, DWordAddr };
1404       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1405                                      Op->getVTList(), Args, MemVT,
1406                                      StoreNode->getMemOperand());
1407     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1408                Value.getValueType().bitsGE(MVT::i32)) {
1409       // Convert pointer from byte address to dword address.
1410       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1411                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1412                                     Ptr, DAG.getConstant(2, MVT::i32)));
1413
           // isTruncatingStore() is already known to be false in this branch,
           // so only indexed stores can reach the unreachable.
1414       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1415         llvm_unreachable("Truncated and indexed stores not supported yet");
1416       } else {
1417         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1418       }
1419       return Chain;
1420     }
1421   }
1422
1423   EVT ValueVT = Value.getValueType();
1424
1425   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1426     return SDValue();
1427   }
1428
       // NOTE(review): this repeats the call made at the top of the function
       // with the same arguments; presumably it returns an empty value again -
       // confirm whether it is still needed.
1429   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1430   if (Ret.getNode()) {
1431     return Ret;
1432   }
1433   // Lowering for indirect addressing
1434
1435   const MachineFunction &MF = DAG.getMachineFunction();
1436   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1437       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1438   unsigned StackWidth = TFL->getStackWidth(MF);
1439
1440   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1441
1442   if (ValueVT.isVector()) {
1443     unsigned NumElemVT = ValueVT.getVectorNumElements();
1444     EVT ElemVT = ValueVT.getVectorElementType();
1445     SmallVector<SDValue, 4> Stores(NumElemVT);
1446
         // NOTE(review): the message below says "load" although this is the
         // store path.
1447     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1448                                       "vector width in load");
1449
1450     for (unsigned i = 0; i < NumElemVT; ++i) {
1451       unsigned Channel, PtrIncr;
1452       getStackAddress(StackWidth, i, Channel, PtrIncr);
           // Ptr is updated in place, so PtrIncr accumulates across elements.
1453       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1454                         DAG.getConstant(PtrIncr, MVT::i32));
1455       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1456                                  Value, DAG.getConstant(i, MVT::i32));
1457
1458       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1459                               Chain, Elem, Ptr,
1460                               DAG.getTargetConstant(Channel, MVT::i32));
1461     }
1462      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1463    } else {
         // i8 scalars are zero-extended to i32 before being stored.
1464     if (ValueVT == MVT::i8) {
1465       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1466     }
1467     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1468     DAG.getTargetConstant(0, MVT::i32)); // Channel
1469   }
1470
1471   return Chain;
1472 }
1473
1474 // return (512 + (kc_bank << 12)
1475 static int
1476 ConstantAddressBlock(unsigned AddressSpace) {
1477   switch (AddressSpace) {
1478   case AMDGPUAS::CONSTANT_BUFFER_0:
1479     return 512;
1480   case AMDGPUAS::CONSTANT_BUFFER_1:
1481     return 512 + 4096;
1482   case AMDGPUAS::CONSTANT_BUFFER_2:
1483     return 512 + 4096 * 2;
1484   case AMDGPUAS::CONSTANT_BUFFER_3:
1485     return 512 + 4096 * 3;
1486   case AMDGPUAS::CONSTANT_BUFFER_4:
1487     return 512 + 4096 * 4;
1488   case AMDGPUAS::CONSTANT_BUFFER_5:
1489     return 512 + 4096 * 5;
1490   case AMDGPUAS::CONSTANT_BUFFER_6:
1491     return 512 + 4096 * 6;
1492   case AMDGPUAS::CONSTANT_BUFFER_7:
1493     return 512 + 4096 * 7;
1494   case AMDGPUAS::CONSTANT_BUFFER_8:
1495     return 512 + 4096 * 8;
1496   case AMDGPUAS::CONSTANT_BUFFER_9:
1497     return 512 + 4096 * 9;
1498   case AMDGPUAS::CONSTANT_BUFFER_10:
1499     return 512 + 4096 * 10;
1500   case AMDGPUAS::CONSTANT_BUFFER_11:
1501     return 512 + 4096 * 11;
1502   case AMDGPUAS::CONSTANT_BUFFER_12:
1503     return 512 + 4096 * 12;
1504   case AMDGPUAS::CONSTANT_BUFFER_13:
1505     return 512 + 4096 * 13;
1506   case AMDGPUAS::CONSTANT_BUFFER_14:
1507     return 512 + 4096 * 14;
1508   case AMDGPUAS::CONSTANT_BUFFER_15:
1509     return 512 + 4096 * 15;
1510   default:
1511     return -1;
1512   }
1513 }
1514
1515 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1516 {
1517   EVT VT = Op.getValueType();
1518   SDLoc DL(Op);
1519   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1520   SDValue Chain = Op.getOperand(0);
1521   SDValue Ptr = Op.getOperand(1);
1522   SDValue LoweredLoad;
1523
1524   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1525   if (Ret.getNode()) {
1526     SDValue Ops[2] = {
1527       Ret,
1528       Chain
1529     };
1530     return DAG.getMergeValues(Ops, DL);
1531   }
1532
1533   // Lower loads constant address space global variable loads
1534   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1535       isa<GlobalVariable>(
1536           GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1537
1538     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1539         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1540     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1541         DAG.getConstant(2, MVT::i32));
1542     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1543                        LoadNode->getChain(), Ptr,
1544                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1545   }
1546
1547   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1548     SDValue MergedValues[2] = {
1549       ScalarizeVectorLoad(Op, DAG),
1550       Chain
1551     };
1552     return DAG.getMergeValues(MergedValues, DL);
1553   }
1554
1555   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1556   if (ConstantBlock > -1 &&
1557       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1558        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1559     SDValue Result;
1560     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1561         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1562         isa<ConstantSDNode>(Ptr)) {
1563       SDValue Slots[4];
1564       for (unsigned i = 0; i < 4; i++) {
1565         // We want Const position encoded with the following formula :
1566         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1567         // const_index is Ptr computed by llvm using an alignment of 16.
1568         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1569         // then div by 4 at the ISel step
1570         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1571             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1572         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1573       }
1574       EVT NewVT = MVT::v4i32;
1575       unsigned NumElements = 4;
1576       if (VT.isVector()) {
1577         NewVT = VT;
1578         NumElements = VT.getVectorNumElements();
1579       }
1580       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1581                            makeArrayRef(Slots, NumElements));
1582     } else {
1583       // non-constant ptr can't be folded, keeps it as a v4f32 load
1584       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1585           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1586           DAG.getConstant(LoadNode->getAddressSpace() -
1587                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1588           );
1589     }
1590
1591     if (!VT.isVector()) {
1592       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1593           DAG.getConstant(0, MVT::i32));
1594     }
1595
1596     SDValue MergedValues[2] = {
1597       Result,
1598       Chain
1599     };
1600     return DAG.getMergeValues(MergedValues, DL);
1601   }
1602
1603   // For most operations returning SDValue() will result in the node being
1604   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1605   // need to manually expand loads that may be legal in some address spaces and
1606   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1607   // compute shaders, since the data is sign extended when it is uploaded to the
1608   // buffer. However SEXT loads from other address spaces are not supported, so
1609   // we need to expand them here.
1610   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1611     EVT MemVT = LoadNode->getMemoryVT();
1612     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1613     SDValue ShiftAmount =
1614           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1615     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1616                                   LoadNode->getPointerInfo(), MemVT,
1617                                   LoadNode->isVolatile(),
1618                                   LoadNode->isNonTemporal(),
1619                                   LoadNode->isInvariant(),
1620                                   LoadNode->getAlignment());
1621     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1622     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1623
1624     SDValue MergedValues[2] = { Sra, Chain };
1625     return DAG.getMergeValues(MergedValues, DL);
1626   }
1627
1628   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1629     return SDValue();
1630   }
1631
1632   // Lowering for indirect addressing
1633   const MachineFunction &MF = DAG.getMachineFunction();
1634   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1635       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1636   unsigned StackWidth = TFL->getStackWidth(MF);
1637
1638   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1639
1640   if (VT.isVector()) {
1641     unsigned NumElemVT = VT.getVectorNumElements();
1642     EVT ElemVT = VT.getVectorElementType();
1643     SDValue Loads[4];
1644
1645     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1646                                       "vector width in load");
1647
1648     for (unsigned i = 0; i < NumElemVT; ++i) {
1649       unsigned Channel, PtrIncr;
1650       getStackAddress(StackWidth, i, Channel, PtrIncr);
1651       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1652                         DAG.getConstant(PtrIncr, MVT::i32));
1653       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1654                              Chain, Ptr,
1655                              DAG.getTargetConstant(Channel, MVT::i32),
1656                              Op.getOperand(2));
1657     }
1658     for (unsigned i = NumElemVT; i < 4; ++i) {
1659       Loads[i] = DAG.getUNDEF(ElemVT);
1660     }
1661     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1662     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1663   } else {
1664     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1665                               Chain, Ptr,
1666                               DAG.getTargetConstant(0, MVT::i32), // Channel
1667                               Op.getOperand(2));
1668   }
1669
1670   SDValue Ops[2] = {
1671     LoweredLoad,
1672     Chain
1673   };
1674
1675   return DAG.getMergeValues(Ops, DL);
1676 }
1677
1678 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1679   SDValue Chain = Op.getOperand(0);
1680   SDValue Cond  = Op.getOperand(1);
1681   SDValue Jump  = Op.getOperand(2);
1682
1683   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1684                      Chain, Jump, Cond);
1685 }
1686
1687 /// XXX Only kernel functions are supported, so we can assume for now that
1688 /// every function is a kernel function, but in the future we should use
1689 /// separate calling conventions for kernel and non-kernel functions.
1690 SDValue R600TargetLowering::LowerFormalArguments(
1691                                       SDValue Chain,
1692                                       CallingConv::ID CallConv,
1693                                       bool isVarArg,
1694                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1695                                       SDLoc DL, SelectionDAG &DAG,
1696                                       SmallVectorImpl<SDValue> &InVals) const {
1697   SmallVector<CCValAssign, 16> ArgLocs;
1698   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1699                  *DAG.getContext());
1700   MachineFunction &MF = DAG.getMachineFunction();
1701   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
1702
1703   SmallVector<ISD::InputArg, 8> LocalIns;
1704
1705   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1706
1707   AnalyzeFormalArguments(CCInfo, LocalIns);
1708
1709   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1710     CCValAssign &VA = ArgLocs[i];
1711     const ISD::InputArg &In = Ins[i];
1712     EVT VT = In.VT;
1713     EVT MemVT = VA.getLocVT();
1714     if (!VT.isVector() && MemVT.isVector()) {
1715       // Get load source type if scalarized.
1716       MemVT = MemVT.getVectorElementType();
1717     }
1718
1719     if (ShaderType != ShaderType::COMPUTE) {
1720       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1721       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1722       InVals.push_back(Register);
1723       continue;
1724     }
1725
1726     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1727                                           AMDGPUAS::CONSTANT_BUFFER_0);
1728
1729     // i64 isn't a legal type, so the register type used ends up as i32, which
1730     // isn't expected here. It attempts to create this sextload, but it ends up
1731     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1732     // for <1 x i64>.
1733
1734     // The first 36 bytes of the input buffer contains information about
1735     // thread group and global sizes.
1736     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1737     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1738       // FIXME: This should really check the extload type, but the handling of
1739       // extload vector parameters seems to be broken.
1740
1741       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1742       Ext = ISD::SEXTLOAD;
1743     }
1744
1745     // Compute the offset from the value.
1746     // XXX - I think PartOffset should give you this, but it seems to give the
1747     // size of the register which isn't useful.
1748
1749     unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
1750     unsigned PartOffset = VA.getLocMemOffset();
1751
1752     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1753     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1754                               DAG.getConstant(36 + PartOffset, MVT::i32),
1755                               DAG.getUNDEF(MVT::i32),
1756                               PtrInfo,
1757                               MemVT, false, true, true, 4);
1758
1759     // 4 is the preferred alignment for the CONSTANT memory space.
1760     InVals.push_back(Arg);
1761   }
1762   return Chain;
1763 }
1764
1765 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1766    if (!VT.isVector())
1767      return MVT::i32;
1768    return VT.changeVectorElementTypeToInteger();
1769 }
1770
1771 static SDValue CompactSwizzlableVector(
1772   SelectionDAG &DAG, SDValue VectorEntry,
1773   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1774   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1775   assert(RemapSwizzle.empty());
1776   SDValue NewBldVec[4] = {
1777     VectorEntry.getOperand(0),
1778     VectorEntry.getOperand(1),
1779     VectorEntry.getOperand(2),
1780     VectorEntry.getOperand(3)
1781   };
1782
1783   for (unsigned i = 0; i < 4; i++) {
1784     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1785       // We mask write here to teach later passes that the ith element of this
1786       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1787       // break false dependencies and additionnaly make assembly easier to read.
1788       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1789     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1790       if (C->isZero()) {
1791         RemapSwizzle[i] = 4; // SEL_0
1792         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1793       } else if (C->isExactlyValue(1.0)) {
1794         RemapSwizzle[i] = 5; // SEL_1
1795         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1796       }
1797     }
1798
1799     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1800       continue;
1801     for (unsigned j = 0; j < i; j++) {
1802       if (NewBldVec[i] == NewBldVec[j]) {
1803         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1804         RemapSwizzle[i] = j;
1805         break;
1806       }
1807     }
1808   }
1809
1810   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1811                      VectorEntry.getValueType(), NewBldVec);
1812 }
1813
1814 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1815                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1816   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1817   assert(RemapSwizzle.empty());
1818   SDValue NewBldVec[4] = {
1819       VectorEntry.getOperand(0),
1820       VectorEntry.getOperand(1),
1821       VectorEntry.getOperand(2),
1822       VectorEntry.getOperand(3)
1823   };
1824   bool isUnmovable[4] = { false, false, false, false };
1825   for (unsigned i = 0; i < 4; i++) {
1826     RemapSwizzle[i] = i;
1827     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1828       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1829           ->getZExtValue();
1830       if (i == Idx)
1831         isUnmovable[Idx] = true;
1832     }
1833   }
1834
1835   for (unsigned i = 0; i < 4; i++) {
1836     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1837       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1838           ->getZExtValue();
1839       if (isUnmovable[Idx])
1840         continue;
1841       // Swap i and Idx
1842       std::swap(NewBldVec[Idx], NewBldVec[i]);
1843       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1844       break;
1845     }
1846   }
1847
1848   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1849                      VectorEntry.getValueType(), NewBldVec);
1850 }
1851
1852
1853 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1854 SDValue Swz[4], SelectionDAG &DAG) const {
1855   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1856   // Old -> New swizzle values
1857   DenseMap<unsigned, unsigned> SwizzleRemap;
1858
1859   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1860   for (unsigned i = 0; i < 4; i++) {
1861     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1862     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1863       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1864   }
1865
1866   SwizzleRemap.clear();
1867   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1868   for (unsigned i = 0; i < 4; i++) {
1869     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1870     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1871       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1872   }
1873
1874   return BuildVector;
1875 }
1876
1877
1878 //===----------------------------------------------------------------------===//
1879 // Custom DAG Optimizations
1880 //===----------------------------------------------------------------------===//
1881
1882 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1883                                               DAGCombinerInfo &DCI) const {
1884   SelectionDAG &DAG = DCI.DAG;
1885
1886   switch (N->getOpcode()) {
1887   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1888   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1889   case ISD::FP_ROUND: {
1890       SDValue Arg = N->getOperand(0);
1891       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1892         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1893                            Arg.getOperand(0));
1894       }
1895       break;
1896     }
1897
1898   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1899   // (i32 select_cc f32, f32, -1, 0 cc)
1900   //
1901   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1902   // this to one of the SET*_DX10 instructions.
1903   case ISD::FP_TO_SINT: {
1904     SDValue FNeg = N->getOperand(0);
1905     if (FNeg.getOpcode() != ISD::FNEG) {
1906       return SDValue();
1907     }
1908     SDValue SelectCC = FNeg.getOperand(0);
1909     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1910         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1911         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1912         !isHWTrueValue(SelectCC.getOperand(2)) ||
1913         !isHWFalseValue(SelectCC.getOperand(3))) {
1914       return SDValue();
1915     }
1916
1917     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1918                            SelectCC.getOperand(0), // LHS
1919                            SelectCC.getOperand(1), // RHS
1920                            DAG.getConstant(-1, MVT::i32), // True
1921                            DAG.getConstant(0, MVT::i32),  // Flase
1922                            SelectCC.getOperand(4)); // CC
1923
1924     break;
1925   }
1926
1927   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1928   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1929   case ISD::INSERT_VECTOR_ELT: {
1930     SDValue InVec = N->getOperand(0);
1931     SDValue InVal = N->getOperand(1);
1932     SDValue EltNo = N->getOperand(2);
1933     SDLoc dl(N);
1934
1935     // If the inserted element is an UNDEF, just use the input vector.
1936     if (InVal.getOpcode() == ISD::UNDEF)
1937       return InVec;
1938
1939     EVT VT = InVec.getValueType();
1940
1941     // If we can't generate a legal BUILD_VECTOR, exit
1942     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1943       return SDValue();
1944
1945     // Check that we know which element is being inserted
1946     if (!isa<ConstantSDNode>(EltNo))
1947       return SDValue();
1948     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1949
1950     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1951     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1952     // vector elements.
1953     SmallVector<SDValue, 8> Ops;
1954     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1955       Ops.append(InVec.getNode()->op_begin(),
1956                  InVec.getNode()->op_end());
1957     } else if (InVec.getOpcode() == ISD::UNDEF) {
1958       unsigned NElts = VT.getVectorNumElements();
1959       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1960     } else {
1961       return SDValue();
1962     }
1963
1964     // Insert the element
1965     if (Elt < Ops.size()) {
1966       // All the operands of BUILD_VECTOR must have the same type;
1967       // we enforce that here.
1968       EVT OpVT = Ops[0].getValueType();
1969       if (InVal.getValueType() != OpVT)
1970         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1971           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1972           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1973       Ops[Elt] = InVal;
1974     }
1975
1976     // Return the new vector
1977     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1978   }
1979
1980   // Extract_vec (Build_vector) generated by custom lowering
1981   // also needs to be customly combined
1982   case ISD::EXTRACT_VECTOR_ELT: {
1983     SDValue Arg = N->getOperand(0);
1984     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1985       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1986         unsigned Element = Const->getZExtValue();
1987         return Arg->getOperand(Element);
1988       }
1989     }
1990     if (Arg.getOpcode() == ISD::BITCAST &&
1991         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1992       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1993         unsigned Element = Const->getZExtValue();
1994         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1995             Arg->getOperand(0).getOperand(Element));
1996       }
1997     }
1998   }
1999
2000   case ISD::SELECT_CC: {
2001     // Try common optimizations
2002     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2003     if (Ret.getNode())
2004       return Ret;
2005
2006     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2007     //      selectcc x, y, a, b, inv(cc)
2008     //
2009     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2010     //      selectcc x, y, a, b, cc
2011     SDValue LHS = N->getOperand(0);
2012     if (LHS.getOpcode() != ISD::SELECT_CC) {
2013       return SDValue();
2014     }
2015
2016     SDValue RHS = N->getOperand(1);
2017     SDValue True = N->getOperand(2);
2018     SDValue False = N->getOperand(3);
2019     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2020
2021     if (LHS.getOperand(2).getNode() != True.getNode() ||
2022         LHS.getOperand(3).getNode() != False.getNode() ||
2023         RHS.getNode() != False.getNode()) {
2024       return SDValue();
2025     }
2026
2027     switch (NCC) {
2028     default: return SDValue();
2029     case ISD::SETNE: return LHS;
2030     case ISD::SETEQ: {
2031       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2032       LHSCC = ISD::getSetCCInverse(LHSCC,
2033                                   LHS.getOperand(0).getValueType().isInteger());
2034       if (DCI.isBeforeLegalizeOps() ||
2035           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2036         return DAG.getSelectCC(SDLoc(N),
2037                                LHS.getOperand(0),
2038                                LHS.getOperand(1),
2039                                LHS.getOperand(2),
2040                                LHS.getOperand(3),
2041                                LHSCC);
2042       break;
2043     }
2044     }
2045     return SDValue();
2046   }
2047
2048   case AMDGPUISD::EXPORT: {
2049     SDValue Arg = N->getOperand(1);
2050     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2051       break;
2052
2053     SDValue NewArgs[8] = {
2054       N->getOperand(0), // Chain
2055       SDValue(),
2056       N->getOperand(2), // ArrayBase
2057       N->getOperand(3), // Type
2058       N->getOperand(4), // SWZ_X
2059       N->getOperand(5), // SWZ_Y
2060       N->getOperand(6), // SWZ_Z
2061       N->getOperand(7) // SWZ_W
2062     };
2063     SDLoc DL(N);
2064     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2065     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2066   }
2067   case AMDGPUISD::TEXTURE_FETCH: {
2068     SDValue Arg = N->getOperand(1);
2069     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2070       break;
2071
2072     SDValue NewArgs[19] = {
2073       N->getOperand(0),
2074       N->getOperand(1),
2075       N->getOperand(2),
2076       N->getOperand(3),
2077       N->getOperand(4),
2078       N->getOperand(5),
2079       N->getOperand(6),
2080       N->getOperand(7),
2081       N->getOperand(8),
2082       N->getOperand(9),
2083       N->getOperand(10),
2084       N->getOperand(11),
2085       N->getOperand(12),
2086       N->getOperand(13),
2087       N->getOperand(14),
2088       N->getOperand(15),
2089       N->getOperand(16),
2090       N->getOperand(17),
2091       N->getOperand(18),
2092     };
2093     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2094     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2095         NewArgs);
2096   }
2097   }
2098
2099   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2100 }
2101
2102 static bool
2103 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2104             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2105   const R600InstrInfo *TII =
2106       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2107   if (!Src.isMachineOpcode())
2108     return false;
2109   switch (Src.getMachineOpcode()) {
2110   case AMDGPU::FNEG_R600:
2111     if (!Neg.getNode())
2112       return false;
2113     Src = Src.getOperand(0);
2114     Neg = DAG.getTargetConstant(1, MVT::i32);
2115     return true;
2116   case AMDGPU::FABS_R600:
2117     if (!Abs.getNode())
2118       return false;
2119     Src = Src.getOperand(0);
2120     Abs = DAG.getTargetConstant(1, MVT::i32);
2121     return true;
2122   case AMDGPU::CONST_COPY: {
2123     unsigned Opcode = ParentNode->getMachineOpcode();
2124     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2125
2126     if (!Sel.getNode())
2127       return false;
2128
2129     SDValue CstOffset = Src.getOperand(0);
2130     if (ParentNode->getValueType(0).isVector())
2131       return false;
2132
2133     // Gather constants values
2134     int SrcIndices[] = {
2135       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2136       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2137       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2138       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2139       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2140       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2141       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2142       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2143       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2144       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2145       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2146     };
2147     std::vector<unsigned> Consts;
2148     for (int OtherSrcIdx : SrcIndices) {
2149       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2150       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2151         continue;
2152       if (HasDst) {
2153         OtherSrcIdx--;
2154         OtherSelIdx--;
2155       }
2156       if (RegisterSDNode *Reg =
2157           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2158         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2159           ConstantSDNode *Cst
2160             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2161           Consts.push_back(Cst->getZExtValue());
2162         }
2163       }
2164     }
2165
2166     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2167     Consts.push_back(Cst->getZExtValue());
2168     if (!TII->fitsConstReadLimitations(Consts)) {
2169       return false;
2170     }
2171
2172     Sel = CstOffset;
2173     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2174     return true;
2175   }
2176   case AMDGPU::MOV_IMM_I32:
2177   case AMDGPU::MOV_IMM_F32: {
2178     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2179     uint64_t ImmValue = 0;
2180
2181
2182     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2183       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2184       float FloatValue = FPC->getValueAPF().convertToFloat();
2185       if (FloatValue == 0.0) {
2186         ImmReg = AMDGPU::ZERO;
2187       } else if (FloatValue == 0.5) {
2188         ImmReg = AMDGPU::HALF;
2189       } else if (FloatValue == 1.0) {
2190         ImmReg = AMDGPU::ONE;
2191       } else {
2192         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2193       }
2194     } else {
2195       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2196       uint64_t Value = C->getZExtValue();
2197       if (Value == 0) {
2198         ImmReg = AMDGPU::ZERO;
2199       } else if (Value == 1) {
2200         ImmReg = AMDGPU::ONE_INT;
2201       } else {
2202         ImmValue = Value;
2203       }
2204     }
2205
2206     // Check that we aren't already using an immediate.
2207     // XXX: It's possible for an instruction to have more than one
2208     // immediate operand, but this is not supported yet.
2209     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2210       if (!Imm.getNode())
2211         return false;
2212       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2213       assert(C);
2214       if (C->getZExtValue())
2215         return false;
2216       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2217     }
2218     Src = DAG.getRegister(ImmReg, MVT::i32);
2219     return true;
2220   }
2221   default:
2222     return false;
2223   }
2224 }
2225
2226
2227 /// \brief Fold the instructions after selecting them
2228 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2229                                             SelectionDAG &DAG) const {
2230   const R600InstrInfo *TII =
2231       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2232   if (!Node->isMachineOpcode())
2233     return Node;
2234   unsigned Opcode = Node->getMachineOpcode();
2235   SDValue FakeOp;
2236
2237   std::vector<SDValue> Ops;
2238   for (const SDUse &I : Node->ops())
2239     Ops.push_back(I);
2240
2241   if (Opcode == AMDGPU::DOT_4) {
2242     int OperandIdx[] = {
2243       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2244       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2245       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2246       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2247       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2248       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2249       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2250       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2251         };
2252     int NegIdx[] = {
2253       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2254       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2255       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2256       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2257       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2258       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2259       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2260       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2261     };
2262     int AbsIdx[] = {
2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2265       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2266       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2268       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2269       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2271     };
2272     for (unsigned i = 0; i < 8; i++) {
2273       if (OperandIdx[i] < 0)
2274         return Node;
2275       SDValue &Src = Ops[OperandIdx[i] - 1];
2276       SDValue &Neg = Ops[NegIdx[i] - 1];
2277       SDValue &Abs = Ops[AbsIdx[i] - 1];
2278       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2279       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2280       if (HasDst)
2281         SelIdx--;
2282       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2283       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2284         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2285     }
2286   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2287     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2288       SDValue &Src = Ops[i];
2289       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2290         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2291     }
2292   } else if (Opcode == AMDGPU::CLAMP_R600) {
2293     SDValue Src = Node->getOperand(0);
2294     if (!Src.isMachineOpcode() ||
2295         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2296       return Node;
2297     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2298         AMDGPU::OpName::clamp);
2299     if (ClampIdx < 0)
2300       return Node;
2301     std::vector<SDValue> Ops;
2302     unsigned NumOp = Src.getNumOperands();
2303     for(unsigned i = 0; i < NumOp; ++i)
2304           Ops.push_back(Src.getOperand(i));
2305     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2306     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2307         Node->getVTList(), Ops);
2308   } else {
2309     if (!TII->hasInstrModifiers(Opcode))
2310       return Node;
2311     int OperandIdx[] = {
2312       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2313       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2314       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2315     };
2316     int NegIdx[] = {
2317       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2318       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2319       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2320     };
2321     int AbsIdx[] = {
2322       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2323       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2324       -1
2325     };
2326     for (unsigned i = 0; i < 3; i++) {
2327       if (OperandIdx[i] < 0)
2328         return Node;
2329       SDValue &Src = Ops[OperandIdx[i] - 1];
2330       SDValue &Neg = Ops[NegIdx[i] - 1];
2331       SDValue FakeAbs;
2332       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2333       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2334       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2335       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2336       if (HasDst) {
2337         SelIdx--;
2338         ImmIdx--;
2339       }
2340       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2341       SDValue &Imm = Ops[ImmIdx];
2342       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2343         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2344     }
2345   }
2346
2347   return Node;
2348 }