//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
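
  // The less-than style codes (and the ordered/unordered variants above)
  // are expanded, while the greater-than style codes stay legal;
  // LowerSELECT_CC recovers the expanded forms by swapping operands or
  // inverting the condition, which suggests the hardware SET*/CND*
  // families only implement the GT/GE/EQ/NE comparisons directly.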

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}
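
// For example, with Gen >= R700, sin(x) on f32 lowers to roughly
//
//   SIN_HW(FRACT(x * 0.15915494309f /* 1/(2*Pi) */ + 0.5f) - 0.5f)
//
// i.e. the argument is range-reduced to [-0.5, 0.5] "turns" before the
// hardware instruction sees it; pre-R700 parts get an extra FMUL by Pi
// to account for the different input scale they expect.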

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                      AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
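
// The DwordOffset values passed in by LowerOperation line up with the
// implicit-parameter block that LowerFormalArguments skips (the first
// 36 bytes = 9 dwords of the input buffer): ngroups.{x,y,z} at dwords
// 0-2, global_size.{x,y,z} at 3-5 and local_size.{x,y,z} at 6-8, so e.g.
// r600_read_local_size_z loads from byte offset 8 * 4 = 32.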

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
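
// The fallback above therefore expands an unsupported select_cc in two
// steps, e.g. for an i32-valued select on an f32 compare:
//
//   Cond = select_cc f32 LHS, RHS, 1.0f, 0.0f, cc    ; SET* -> 1.0 / 0.0
//   Res  = select_cc Cond, 0.0f, True, False, setne  ; CND* picks the value
//
// so only hardware-expressible condition codes remain after lowering.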

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
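
// E.g. with StackWidth == 2 every register index covers two 4-byte
// channels, so byte address 24 becomes register index 24 >> 3 == 3; with
// StackWidth == 4 a whole 16-byte register maps to one index and the
// shift is 4.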

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }
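
  // To illustrate the MSKOR operands built above: a truncating i8 store
  // to byte address 7 roughly becomes
  //   DWordAddr = 7 >> 2 = 1
  //   Shift     = (7 & 3) * 8 = 24
  //   Input     = < TruncValue << 24, 0, 0, 0xFF << 24 >
  // so the hardware can merge just the addressed byte into dword 1.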

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
                        Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
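
// I.e. each kc_bank occupies a 4096-slot window after the first 512
// reserved positions: CONSTANT_BUFFER_N maps to 512 + (N << 12), matching
// the formula in the comment above.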

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2] = {
      Ret,
      Chain
    };
    return DAG.getMergeValues(Ops, 2, DL);
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      SplitVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                          LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = Ins[i].VT;
    EVT MemVT = LocalIns[i].VT;

    if (ShaderType != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 MemVT, false, false, 4);
                                 // 4 is the preferred alignment for
                                 // the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector()) return MVT::i32;
   return VT.changeVectorElementTypeToInteger();
}
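
// So a scalar setcc always produces an i32 mask (all ones / zero, per the
// ZeroOrNegativeOneBooleanContent setting in the constructor), while a
// vector compare such as v4f32 yields v4i32.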

static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
      VectorEntry.getOperand(0),
      VectorEntry.getOperand(1),
      VectorEntry.getOperand(2),
      VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128 bit reg usage,
      // break false dependencies and additionally make assembly easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}
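
// The remapped selects follow the hardware source-select encoding assumed
// throughout this file: 0-3 select channels X-W, 4 (SEL_0) reads constant
// 0.0, 5 (SEL_1) reads constant 1.0, and 7 (SEL_MASK_WRITE) masks the
// lane off. E.g. build_vector a, b, 0.0, 1.0 needs no extra MOVs: its
// last two lanes become swizzles SEL_0 and SEL_1.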

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
      VectorEntry.getOperand(0),
      VectorEntry.getOperand(1),
      VectorEntry.getOperand(2),
      VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4], SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}
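
// E.g. an export of build_vector x, y, 0.0, 1.0 with the identity swizzle
// (0, 1, 2, 3) comes back with Swz rewritten to (0, 1, SEL_0, SEL_1) and
// the two constant lanes turned into undef, so the export reads only two
// live channels.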

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs, 19);
  }
  }
  return SDValue();
}

static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constants values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
      int OtherSrcIdx = SrcIndices[i];
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
              ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
       I != E; ++I)
    Ops.push_back(*I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
        AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> Ops;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      Ops.push_back(Src.getOperand(i));
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}