lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineFrameInfo.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineRegisterInfo.h"
  22 #include "llvm/CodeGen/SelectionDAG.h"
  23 #include "llvm/IR/Argument.h"
  24 #include "llvm/IR/Function.h"
  25
  26 using namespace llvm;
  27
  28 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  29     AMDGPUTargetLowering(TM),
  30     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  31   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  32   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  33   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  34   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  35   computeRegisterProperties();
  36
  37   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  38   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  41
  42   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  43   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  45   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
  47   setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
  48   setOperationAction(ISD::OR, MVT::v4i32, Expand);
  49   setOperationAction(ISD::OR, MVT::v2i32, Expand);
  50   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  51   setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  52   setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  53   setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  54   setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  55   setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  56   setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  57   setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  58   setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  59   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  60   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  61   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  62   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  63   setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  64   setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  65
  66   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  67   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  68
  69   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  70
  71   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  72   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  73   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  74
  75   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  76   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  77
  78   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  79   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  80   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  81
  82   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  83   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  84
  85   setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  86   setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
  87
  88   // Legalize loads and stores to the private address space.
  89   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  90   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  91   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  92   setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  93   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  94   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  95   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  96   setOperationAction(ISD::STORE, MVT::i8, Custom);
  97   setOperationAction(ISD::STORE, MVT::i32, Custom);
  98   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  99   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 100
 101   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 102   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 103   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 104
 105   setTargetDAGCombine(ISD::FP_ROUND);
 106   setTargetDAGCombine(ISD::FP_TO_SINT);
 107   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 108   setTargetDAGCombine(ISD::SELECT_CC);
 109
 110   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 111   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 112   setSchedulingPreference(Sched::VLIW);
 113 }
 114
 115 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 116     MachineInstr * MI, MachineBasicBlock * BB) const {
 117   MachineFunction * MF = BB->getParent();
 118   MachineRegisterInfo &MRI = MF->getRegInfo();
 119   MachineBasicBlock::iterator I = *MI;
 120
 121   switch (MI->getOpcode()) {
 122   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 123   case AMDGPU::CLAMP_R600: {
 124     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 125                                                    AMDGPU::MOV,
 126                                                    MI->getOperand(0).getReg(),
 127                                                    MI->getOperand(1).getReg());
 128     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 129     break;
 130   }
 131
 132   case AMDGPU::FABS_R600: {
 133     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 134                                                     AMDGPU::MOV,
 135                                                     MI->getOperand(0).getReg(),
 136                                                     MI->getOperand(1).getReg());
 137     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 138     break;
 139   }
 140
 141   case AMDGPU::FNEG_R600: {
 142     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 143                                                     AMDGPU::MOV,
 144                                                     MI->getOperand(0).getReg(),
 145                                                     MI->getOperand(1).getReg());
 146     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 147     break;
 148   }
 149
 150   case AMDGPU::MASK_WRITE: {
 151     unsigned maskedRegister = MI->getOperand(0).getReg();
 152     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 153     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 154     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 155     break;
 156   }
 157
 158   case AMDGPU::MOV_IMM_F32:
 159     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 160                      MI->getOperand(1).getFPImm()->getValueAPF()
 161                          .bitcastToAPInt().getZExtValue());
 162     break;
 163   case AMDGPU::MOV_IMM_I32:
 164     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 165                      MI->getOperand(1).getImm());
 166     break;
 167   case AMDGPU::CONST_COPY: {
 168     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 169         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 170     TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
 171         MI->getOperand(1).getImm());
 172     break;
 173   }
 174
 175   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 176   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 177     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 178
 179     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 180             .addOperand(MI->getOperand(0))
 181             .addOperand(MI->getOperand(1))
 182             .addImm(EOP); // Set End of program bit
 183     break;
 184   }
 185
 186   case AMDGPU::TXD: {
 187     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 188     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 189     MachineOperand &RID = MI->getOperand(4);
 190     MachineOperand &SID = MI->getOperand(5);
 191     unsigned TextureId = MI->getOperand(6).getImm();
 192     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 193     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 194
 195     switch (TextureId) {
 196     case 5: // Rect
 197       CTX = CTY = 0;
 198       break;
 199     case 6: // Shadow1D
 200       SrcW = SrcZ;
 201       break;
 202     case 7: // Shadow2D
 203       SrcW = SrcZ;
 204       break;
 205     case 8: // ShadowRect
 206       CTX = CTY = 0;
 207       SrcW = SrcZ;
 208       break;
 209     case 9: // 1DArray
 210       SrcZ = SrcY;
 211       CTZ = 0;
 212       break;
 213     case 10: // 2DArray
 214       CTZ = 0;
 215       break;
 216     case 11: // Shadow1DArray
 217       SrcZ = SrcY;
 218       CTZ = 0;
 219       break;
 220     case 12: // Shadow2DArray
 221       CTZ = 0;
 222       break;
 223     }
 224     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 225             .addOperand(MI->getOperand(3))
 226             .addImm(SrcX)
 227             .addImm(SrcY)
 228             .addImm(SrcZ)
 229             .addImm(SrcW)
 230             .addImm(0)
 231             .addImm(0)
 232             .addImm(0)
 233             .addImm(0)
 234             .addImm(1)
 235             .addImm(2)
 236             .addImm(3)
 237             .addOperand(RID)
 238             .addOperand(SID)
 239             .addImm(CTX)
 240             .addImm(CTY)
 241             .addImm(CTZ)
 242             .addImm(CTW);
 243     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 244             .addOperand(MI->getOperand(2))
 245             .addImm(SrcX)
 246             .addImm(SrcY)
 247             .addImm(SrcZ)
 248             .addImm(SrcW)
 249             .addImm(0)
 250             .addImm(0)
 251             .addImm(0)
 252             .addImm(0)
 253             .addImm(1)
 254             .addImm(2)
 255             .addImm(3)
 256             .addOperand(RID)
 257             .addOperand(SID)
 258             .addImm(CTX)
 259             .addImm(CTY)
 260             .addImm(CTZ)
 261             .addImm(CTW);
 262     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 263             .addOperand(MI->getOperand(0))
 264             .addOperand(MI->getOperand(1))
 265             .addImm(SrcX)
 266             .addImm(SrcY)
 267             .addImm(SrcZ)
 268             .addImm(SrcW)
 269             .addImm(0)
 270             .addImm(0)
 271             .addImm(0)
 272             .addImm(0)
 273             .addImm(1)
 274             .addImm(2)
 275             .addImm(3)
 276             .addOperand(RID)
 277             .addOperand(SID)
 278             .addImm(CTX)
 279             .addImm(CTY)
 280             .addImm(CTZ)
 281             .addImm(CTW)
 282             .addReg(T0, RegState::Implicit)
 283             .addReg(T1, RegState::Implicit);
 284     break;
 285   }
 286
 287   case AMDGPU::TXD_SHADOW: {
 288     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 289     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 290     MachineOperand &RID = MI->getOperand(4);
 291     MachineOperand &SID = MI->getOperand(5);
 292     unsigned TextureId = MI->getOperand(6).getImm();
 293     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 294     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 295
 296     switch (TextureId) {
 297     case 5: // Rect
 298       CTX = CTY = 0;
 299       break;
 300     case 6: // Shadow1D
 301       SrcW = SrcZ;
 302       break;
 303     case 7: // Shadow2D
 304       SrcW = SrcZ;
 305       break;
 306     case 8: // ShadowRect
 307       CTX = CTY = 0;
 308       SrcW = SrcZ;
 309       break;
 310     case 9: // 1DArray
 311       SrcZ = SrcY;
 312       CTZ = 0;
 313       break;
 314     case 10: // 2DArray
 315       CTZ = 0;
 316       break;
 317     case 11: // Shadow1DArray
 318       SrcZ = SrcY;
 319       CTZ = 0;
 320       break;
 321     case 12: // Shadow2DArray
 322       CTZ = 0;
 323       break;
 324     }
 325
 326     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 327             .addOperand(MI->getOperand(3))
 328             .addImm(SrcX)
 329             .addImm(SrcY)
 330             .addImm(SrcZ)
 331             .addImm(SrcW)
 332             .addImm(0)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(1)
 337             .addImm(2)
 338             .addImm(3)
 339             .addOperand(RID)
 340             .addOperand(SID)
 341             .addImm(CTX)
 342             .addImm(CTY)
 343             .addImm(CTZ)
 344             .addImm(CTW);
 345     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 346             .addOperand(MI->getOperand(2))
 347             .addImm(SrcX)
 348             .addImm(SrcY)
 349             .addImm(SrcZ)
 350             .addImm(SrcW)
 351             .addImm(0)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(1)
 356             .addImm(2)
 357             .addImm(3)
 358             .addOperand(RID)
 359             .addOperand(SID)
 360             .addImm(CTX)
 361             .addImm(CTY)
 362             .addImm(CTZ)
 363             .addImm(CTW);
 364     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 365             .addOperand(MI->getOperand(0))
 366             .addOperand(MI->getOperand(1))
 367             .addImm(SrcX)
 368             .addImm(SrcY)
 369             .addImm(SrcZ)
 370             .addImm(SrcW)
 371             .addImm(0)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(1)
 376             .addImm(2)
 377             .addImm(3)
 378             .addOperand(RID)
 379             .addOperand(SID)
 380             .addImm(CTX)
 381             .addImm(CTY)
 382             .addImm(CTZ)
 383             .addImm(CTW)
 384             .addReg(T0, RegState::Implicit)
 385             .addReg(T1, RegState::Implicit);
 386     break;
 387   }
 388
 389   case AMDGPU::BRANCH:
 390       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 391               .addOperand(MI->getOperand(0));
 392       break;
 393
 394   case AMDGPU::BRANCH_COND_f32: {
 395     MachineInstr *NewMI =
 396       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 397               AMDGPU::PREDICATE_BIT)
 398               .addOperand(MI->getOperand(1))
 399               .addImm(OPCODE_IS_NOT_ZERO)
 400               .addImm(0); // Flags
 401     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 402     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 403             .addOperand(MI->getOperand(0))
 404             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 405     break;
 406   }
 407
 408   case AMDGPU::BRANCH_COND_i32: {
 409     MachineInstr *NewMI =
 410       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 411             AMDGPU::PREDICATE_BIT)
 412             .addOperand(MI->getOperand(1))
 413             .addImm(OPCODE_IS_NOT_ZERO_INT)
 414             .addImm(0); // Flags
 415     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 416     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 417            .addOperand(MI->getOperand(0))
 418             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 419     break;
 420   }
 421
 422   case AMDGPU::EG_ExportSwz:
 423   case AMDGPU::R600_ExportSwz: {
 424     // Instruction is left unmodified if its not the last one of its type
 425     bool isLastInstructionOfItsType = true;
 426     unsigned InstExportType = MI->getOperand(1).getImm();
 427     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 428          EndBlock = BB->end(); NextExportInst != EndBlock;
 429          NextExportInst = llvm::next(NextExportInst)) {
 430       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 431           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 432         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 433             .getImm();
 434         if (CurrentInstExportType == InstExportType) {
 435           isLastInstructionOfItsType = false;
 436           break;
 437         }
 438       }
 439     }
 440     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 441     if (!EOP && !isLastInstructionOfItsType)
 442       return BB;
 443     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 444     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 445             .addOperand(MI->getOperand(0))
 446             .addOperand(MI->getOperand(1))
 447             .addOperand(MI->getOperand(2))
 448             .addOperand(MI->getOperand(3))
 449             .addOperand(MI->getOperand(4))
 450             .addOperand(MI->getOperand(5))
 451             .addOperand(MI->getOperand(6))
 452             .addImm(CfInst)
 453             .addImm(EOP);
 454     break;
 455   }
 456   case AMDGPU::RETURN: {
 457     // RETURN instructions must have the live-out registers as implicit uses,
 458     // otherwise they appear dead.
 459     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 460     MachineInstrBuilder MIB(*MF, MI);
 461     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 462       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 463     return BB;
 464   }
 465   }
 466
 467   MI->eraseFromParent();
 468   return BB;
 469 }
 470
 471 //===----------------------------------------------------------------------===//
 472 // Custom DAG Lowering Operations
 473 //===----------------------------------------------------------------------===//
 474
 475 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 476   switch (Op.getOpcode()) {
 477   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 478   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 479   case ISD::SELECT: return LowerSELECT(Op, DAG);
 480   case ISD::STORE: return LowerSTORE(Op, DAG);
 481   case ISD::LOAD: return LowerLOAD(Op, DAG);
 482   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 483   case ISD::INTRINSIC_VOID: {
 484     SDValue Chain = Op.getOperand(0);
 485     unsigned IntrinsicID =
 486                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 487     switch (IntrinsicID) {
 488     case AMDGPUIntrinsic::AMDGPU_store_output: {
 489       MachineFunction &MF = DAG.getMachineFunction();
 490       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 491       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 492       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 493       MFI->LiveOuts.push_back(Reg);
 494       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 495     }
 496     case AMDGPUIntrinsic::R600_store_swizzle: {
 497       const SDValue Args[8] = {
 498         Chain,
 499         Op.getOperand(2), // Export Value
 500         Op.getOperand(3), // ArrayBase
 501         Op.getOperand(4), // Type
 502         DAG.getConstant(0, MVT::i32), // SWZ_X
 503         DAG.getConstant(1, MVT::i32), // SWZ_Y
 504         DAG.getConstant(2, MVT::i32), // SWZ_Z
 505         DAG.getConstant(3, MVT::i32) // SWZ_W
 506       };
 507       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
 508           Args, 8);
 509     }
 510
 511     // default for switch(IntrinsicID)
 512     default: break;
 513     }
 514     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 515     break;
 516   }
 517   case ISD::INTRINSIC_WO_CHAIN: {
 518     unsigned IntrinsicID =
 519                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 520     EVT VT = Op.getValueType();
 521     SDLoc DL(Op);
 522     switch(IntrinsicID) {
 523     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 524     case AMDGPUIntrinsic::R600_load_input: {
 525       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 526       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 527       MachineFunction &MF = DAG.getMachineFunction();
 528       MachineRegisterInfo &MRI = MF.getRegInfo();
 529       MRI.addLiveIn(Reg);
 530       return DAG.getCopyFromReg(DAG.getEntryNode(),
 531           SDLoc(DAG.getEntryNode()), Reg, VT);
 532     }
 533
 534     case AMDGPUIntrinsic::R600_interp_input: {
 535       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 536       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 537       MachineSDNode *interp;
 538       if (ijb < 0) {
 539         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 540             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 541         return DAG.getTargetExtractSubreg(
 542             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 543             DL, MVT::f32, SDValue(interp, 0));
 544       }
 545
 546       MachineFunction &MF = DAG.getMachineFunction();
 547       MachineRegisterInfo &MRI = MF.getRegInfo();
 548       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 549       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 550       MRI.addLiveIn(RegisterI);
 551       MRI.addLiveIn(RegisterJ);
 552       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 553           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 554       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 555           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 556
 557       if (slot % 4 < 2)
 558         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 559             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 560             RegisterJNode, RegisterINode);
 561       else
 562         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 563             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 564             RegisterJNode, RegisterINode);
 565       return SDValue(interp, slot % 2);
 566     }
 567     case AMDGPUIntrinsic::R600_tex:
 568     case AMDGPUIntrinsic::R600_texc:
 569     case AMDGPUIntrinsic::R600_txl:
 570     case AMDGPUIntrinsic::R600_txlc:
 571     case AMDGPUIntrinsic::R600_txb:
 572     case AMDGPUIntrinsic::R600_txbc:
 573     case AMDGPUIntrinsic::R600_txf:
 574     case AMDGPUIntrinsic::R600_txq:
 575     case AMDGPUIntrinsic::R600_ddx:
 576     case AMDGPUIntrinsic::R600_ddy: {
 577       unsigned TextureOp;
 578       switch (IntrinsicID) {
 579       case AMDGPUIntrinsic::R600_tex:
 580         TextureOp = 0;
 581         break;
 582       case AMDGPUIntrinsic::R600_texc:
 583         TextureOp = 1;
 584         break;
 585       case AMDGPUIntrinsic::R600_txl:
 586         TextureOp = 2;
 587         break;
 588       case AMDGPUIntrinsic::R600_txlc:
 589         TextureOp = 3;
 590         break;
 591       case AMDGPUIntrinsic::R600_txb:
 592         TextureOp = 4;
 593         break;
 594       case AMDGPUIntrinsic::R600_txbc:
 595         TextureOp = 5;
 596         break;
 597       case AMDGPUIntrinsic::R600_txf:
 598         TextureOp = 6;
 599         break;
 600       case AMDGPUIntrinsic::R600_txq:
 601         TextureOp = 7;
 602         break;
 603       case AMDGPUIntrinsic::R600_ddx:
 604         TextureOp = 8;
 605         break;
 606       case AMDGPUIntrinsic::R600_ddy:
 607         TextureOp = 9;
 608         break;
 609       default:
 610         llvm_unreachable("Unknow Texture Operation");
 611       }
 612
 613       SDValue TexArgs[19] = {
 614         DAG.getConstant(TextureOp, MVT::i32),
 615         Op.getOperand(1),
 616         DAG.getConstant(0, MVT::i32),
 617         DAG.getConstant(1, MVT::i32),
 618         DAG.getConstant(2, MVT::i32),
 619         DAG.getConstant(3, MVT::i32),
 620         Op.getOperand(2),
 621         Op.getOperand(3),
 622         Op.getOperand(4),
 623         DAG.getConstant(0, MVT::i32),
 624         DAG.getConstant(1, MVT::i32),
 625         DAG.getConstant(2, MVT::i32),
 626         DAG.getConstant(3, MVT::i32),
 627         Op.getOperand(5),
 628         Op.getOperand(6),
 629         Op.getOperand(7),
 630         Op.getOperand(8),
 631         Op.getOperand(9),
 632         Op.getOperand(10)
 633       };
 634       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
 635     }
 636     case AMDGPUIntrinsic::AMDGPU_dp4: {
 637       SDValue Args[8] = {
 638       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 639           DAG.getConstant(0, MVT::i32)),
 640       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 641           DAG.getConstant(0, MVT::i32)),
 642       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 643           DAG.getConstant(1, MVT::i32)),
 644       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 645           DAG.getConstant(1, MVT::i32)),
 646       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 647           DAG.getConstant(2, MVT::i32)),
 648       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 649           DAG.getConstant(2, MVT::i32)),
 650       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 651           DAG.getConstant(3, MVT::i32)),
 652       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 653           DAG.getConstant(3, MVT::i32))
 654       };
 655       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
 656     }
 657
 658     case Intrinsic::r600_read_ngroups_x:
 659       return LowerImplicitParameter(DAG, VT, DL, 0);
 660     case Intrinsic::r600_read_ngroups_y:
 661       return LowerImplicitParameter(DAG, VT, DL, 1);
 662     case Intrinsic::r600_read_ngroups_z:
 663       return LowerImplicitParameter(DAG, VT, DL, 2);
 664     case Intrinsic::r600_read_global_size_x:
 665       return LowerImplicitParameter(DAG, VT, DL, 3);
 666     case Intrinsic::r600_read_global_size_y:
 667       return LowerImplicitParameter(DAG, VT, DL, 4);
 668     case Intrinsic::r600_read_global_size_z:
 669       return LowerImplicitParameter(DAG, VT, DL, 5);
 670     case Intrinsic::r600_read_local_size_x:
 671       return LowerImplicitParameter(DAG, VT, DL, 6);
 672     case Intrinsic::r600_read_local_size_y:
 673       return LowerImplicitParameter(DAG, VT, DL, 7);
 674     case Intrinsic::r600_read_local_size_z:
 675       return LowerImplicitParameter(DAG, VT, DL, 8);
 676
 677     case Intrinsic::r600_read_tgid_x:
 678       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 679                                   AMDGPU::T1_X, VT);
 680     case Intrinsic::r600_read_tgid_y:
 681       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 682                                   AMDGPU::T1_Y, VT);
 683     case Intrinsic::r600_read_tgid_z:
 684       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 685                                   AMDGPU::T1_Z, VT);
 686     case Intrinsic::r600_read_tidig_x:
 687       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 688                                   AMDGPU::T0_X, VT);
 689     case Intrinsic::r600_read_tidig_y:
 690       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 691                                   AMDGPU::T0_Y, VT);
 692     case Intrinsic::r600_read_tidig_z:
 693       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 694                                   AMDGPU::T0_Z, VT);
 695     }
 696     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 697     break;
 698   }
 699   } // end switch(Op.getOpcode())
 700   return SDValue();
 701 }
 702
 703 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 704                                             SmallVectorImpl<SDValue> &Results,
 705                                             SelectionDAG &DAG) const {
 706   switch (N->getOpcode()) {
 707   default: return;
 708   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 709     return;
 710   case ISD::LOAD: {
 711     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 712     Results.push_back(SDValue(Node, 0));
 713     Results.push_back(SDValue(Node, 1));
 714     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 715     // function
 716     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 717     return;
 718   }
 719   case ISD::STORE:
 720     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 721     Results.push_back(SDValue(Node, 0));
 722     return;
 723   }
 724 }
 725
 726 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 727   return DAG.getNode(
 728       ISD::SETCC,
 729       SDLoc(Op),
 730       MVT::i1,
 731       Op, DAG.getConstantFP(0.0f, MVT::f32),
 732       DAG.getCondCode(ISD::SETNE)
 733       );
 734 }
 735
 736 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 737                                                    SDLoc DL,
 738                                                    unsigned DwordOffset) const {
 739   unsigned ByteOffset = DwordOffset * 4;
 740   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 741                                       AMDGPUAS::PARAM_I_ADDRESS);
 742
 743   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 744   assert(isInt<16>(ByteOffset));
 745
 746   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 747                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 748                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 749                      false, false, false, 0);
 750 }
 751
 752 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 753
 754   MachineFunction &MF = DAG.getMachineFunction();
 755   const AMDGPUFrameLowering *TFL =
 756    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 757
 758   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 759   assert(FIN);
 760
 761   unsigned FrameIndex = FIN->getIndex();
 762   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 763   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 764 }
 765
 766 bool R600TargetLowering::isZero(SDValue Op) const {
 767   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 768     return Cst->isNullValue();
 769   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 770     return CstFP->isZero();
 771   } else {
 772     return false;
 773   }
 774 }
 775
 776 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 777   SDLoc DL(Op);
 778   EVT VT = Op.getValueType();
 779
 780   SDValue LHS = Op.getOperand(0);
 781   SDValue RHS = Op.getOperand(1);
 782   SDValue True = Op.getOperand(2);
 783   SDValue False = Op.getOperand(3);
 784   SDValue CC = Op.getOperand(4);
 785   SDValue Temp;
 786
 787   // LHS and RHS are guaranteed to be the same value type
 788   EVT CompareVT = LHS.getValueType();
 789
 790   // Check if we can lower this to a native operation.
 791
 792   // Try to lower to a SET* instruction:
 793   //
 794   // SET* can match the following patterns:
 795   //
 796   // select_cc f32, f32, -1,  0, cc_any
 797   // select_cc f32, f32, 1.0f, 0.0f, cc_any
 798   // select_cc i32, i32, -1,  0, cc_any
 799   //
 800
 801   // Move hardware True/False values to the correct operand.
 802   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 803     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 804     std::swap(False, True);
 805     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 806   }
 807
 808   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 809       (CompareVT == VT || VT == MVT::i32)) {
 810     // This can be matched by a SET* instruction.
 811     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 812   }
 813
 814   // Try to lower to a CND* instruction:
 815   //
 816   // CND* can match the following patterns:
 817   //
 818   // select_cc f32, 0.0, f32, f32, cc_any
 819   // select_cc f32, 0.0, i32, i32, cc_any
 820   // select_cc i32, 0,   f32, f32, cc_any
 821   // select_cc i32, 0,   i32, i32, cc_any
 822   //
 823   if (isZero(LHS) || isZero(RHS)) {
 824     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 825     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 826     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 827     if (CompareVT != VT) {
 828       // Bitcast True / False to the correct types.  This will end up being
 829       // a nop, but it allows us to define only a single pattern in the
 830       // .TD files for each CND* instruction rather than having to have
 831       // one pattern for integer True/False and one for fp True/False
 832       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 833       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 834     }
 835     if (isZero(LHS)) {
 836       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 837     }
 838
 839     switch (CCOpcode) {
 840     case ISD::SETONE:
 841     case ISD::SETUNE:
 842     case ISD::SETNE:
 843     case ISD::SETULE:
 844     case ISD::SETULT:
 845     case ISD::SETOLE:
 846     case ISD::SETOLT:
 847     case ISD::SETLE:
 848     case ISD::SETLT:
 849       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 850       Temp = True;
 851       True = False;
 852       False = Temp;
 853       break;
 854     default:
 855       break;
 856     }
 857     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 858         Cond, Zero,
 859         True, False,
 860         DAG.getCondCode(CCOpcode));
 861     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 862   }
 863
 864
 865   // Possible Min/Max pattern
 866   SDValue MinMax = LowerMinMax(Op, DAG);
 867   if (MinMax.getNode()) {
 868     return MinMax;
 869   }
 870
 871   // If we make it this for it means we have no native instructions to handle
 872   // this SELECT_CC, so we must lower it.
 873   SDValue HWTrue, HWFalse;
 874
 875   if (CompareVT == MVT::f32) {
 876     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 877     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 878   } else if (CompareVT == MVT::i32) {
 879     HWTrue = DAG.getConstant(-1, CompareVT);
 880     HWFalse = DAG.getConstant(0, CompareVT);
 881   }
 882   else {
 883     assert(!"Unhandled value type in LowerSELECT_CC");
 884   }
 885
 886   // Lower this unsupported SELECT_CC into a combination of two supported
 887   // SELECT_CC operations.
 888   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 889
 890   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 891       Cond, HWFalse,
 892       True, False,
 893       DAG.getCondCode(ISD::SETNE));
 894 }
 895
 896 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 897   return DAG.getNode(ISD::SELECT_CC,
 898       SDLoc(Op),
 899       Op.getValueType(),
 900       Op.getOperand(0),
 901       DAG.getConstant(0, MVT::i32),
 902       Op.getOperand(1),
 903       Op.getOperand(2),
 904       DAG.getCondCode(ISD::SETNE));
 905 }
 906
 907 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 908 /// convert these pointers to a register index.  Each register holds
 909 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 910 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 911 /// for indirect addressing.
 912 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 913                                                unsigned StackWidth,
 914                                                SelectionDAG &DAG) const {
 915   unsigned SRLPad;
 916   switch(StackWidth) {
 917   case 1:
 918     SRLPad = 2;
 919     break;
 920   case 2:
 921     SRLPad = 3;
 922     break;
 923   case 4:
 924     SRLPad = 4;
 925     break;
 926   default: llvm_unreachable("Invalid stack width");
 927   }
 928
 929   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
 930                      DAG.getConstant(SRLPad, MVT::i32));
 931 }
 932
 933 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 934                                          unsigned ElemIdx,
 935                                          unsigned &Channel,
 936                                          unsigned &PtrIncr) const {
 937   switch (StackWidth) {
 938   default:
 939   case 1:
 940     Channel = 0;
 941     if (ElemIdx > 0) {
 942       PtrIncr = 1;
 943     } else {
 944       PtrIncr = 0;
 945     }
 946     break;
 947   case 2:
 948     Channel = ElemIdx % 2;
 949     if (ElemIdx == 2) {
 950       PtrIncr = 1;
 951     } else {
 952       PtrIncr = 0;
 953     }
 954     break;
 955   case 4:
 956     Channel = ElemIdx;
 957     PtrIncr = 0;
 958     break;
 959   }
 960 }
 961
 962 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 963   SDLoc DL(Op);
 964   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 965   SDValue Chain = Op.getOperand(0);
 966   SDValue Value = Op.getOperand(1);
 967   SDValue Ptr = Op.getOperand(2);
 968
 969   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 970       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 971     // Convert pointer from byte address to dword address.
 972     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 973                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 974                                   Ptr, DAG.getConstant(2, MVT::i32)));
 975
 976     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 977       assert(!"Truncated and indexed stores not supported yet");
 978     } else {
 979       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 980     }
 981     return Chain;
 982   }
 983
 984   EVT ValueVT = Value.getValueType();
 985
 986   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
 987     return SDValue();
 988   }
 989
 990   // Lowering for indirect addressing
 991
 992   const MachineFunction &MF = DAG.getMachineFunction();
 993   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
 994                                          getTargetMachine().getFrameLowering());
 995   unsigned StackWidth = TFL->getStackWidth(MF);
 996
 997   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
 998
 999   if (ValueVT.isVector()) {
1000     unsigned NumElemVT = ValueVT.getVectorNumElements();
1001     EVT ElemVT = ValueVT.getVectorElementType();
1002     SDValue Stores[4];
1003
1004     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1005                                       "vector width in load");
1006
1007     for (unsigned i = 0; i < NumElemVT; ++i) {
1008       unsigned Channel, PtrIncr;
1009       getStackAddress(StackWidth, i, Channel, PtrIncr);
1010       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1011                         DAG.getConstant(PtrIncr, MVT::i32));
1012       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1013                                  Value, DAG.getConstant(i, MVT::i32));
1014
1015       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1016                               Chain, Elem, Ptr,
1017                               DAG.getTargetConstant(Channel, MVT::i32));
1018     }
1019      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
1020    } else {
1021     if (ValueVT == MVT::i8) {
1022       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1023     }
1024     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1025     DAG.getTargetConstant(0, MVT::i32)); // Channel
1026   }
1027
1028   return Chain;
1029 }
1030
1031 // return (512 + (kc_bank << 12)
1032 static int
1033 ConstantAddressBlock(unsigned AddressSpace) {
1034   switch (AddressSpace) {
1035   case AMDGPUAS::CONSTANT_BUFFER_0:
1036     return 512;
1037   case AMDGPUAS::CONSTANT_BUFFER_1:
1038     return 512 + 4096;
1039   case AMDGPUAS::CONSTANT_BUFFER_2:
1040     return 512 + 4096 * 2;
1041   case AMDGPUAS::CONSTANT_BUFFER_3:
1042     return 512 + 4096 * 3;
1043   case AMDGPUAS::CONSTANT_BUFFER_4:
1044     return 512 + 4096 * 4;
1045   case AMDGPUAS::CONSTANT_BUFFER_5:
1046     return 512 + 4096 * 5;
1047   case AMDGPUAS::CONSTANT_BUFFER_6:
1048     return 512 + 4096 * 6;
1049   case AMDGPUAS::CONSTANT_BUFFER_7:
1050     return 512 + 4096 * 7;
1051   case AMDGPUAS::CONSTANT_BUFFER_8:
1052     return 512 + 4096 * 8;
1053   case AMDGPUAS::CONSTANT_BUFFER_9:
1054     return 512 + 4096 * 9;
1055   case AMDGPUAS::CONSTANT_BUFFER_10:
1056     return 512 + 4096 * 10;
1057   case AMDGPUAS::CONSTANT_BUFFER_11:
1058     return 512 + 4096 * 11;
1059   case AMDGPUAS::CONSTANT_BUFFER_12:
1060     return 512 + 4096 * 12;
1061   case AMDGPUAS::CONSTANT_BUFFER_13:
1062     return 512 + 4096 * 13;
1063   case AMDGPUAS::CONSTANT_BUFFER_14:
1064     return 512 + 4096 * 14;
1065   case AMDGPUAS::CONSTANT_BUFFER_15:
1066     return 512 + 4096 * 15;
1067   default:
1068     return -1;
1069   }
1070 }
1071
1072 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1073 {
1074   EVT VT = Op.getValueType();
1075   SDLoc DL(Op);
1076   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1077   SDValue Chain = Op.getOperand(0);
1078   SDValue Ptr = Op.getOperand(1);
1079   SDValue LoweredLoad;
1080
1081   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1082   if (ConstantBlock > -1) {
1083     SDValue Result;
1084     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
1085         dyn_cast<Constant>(LoadNode->getSrcValue()) ||
1086         dyn_cast<ConstantSDNode>(Ptr)) {
1087       SDValue Slots[4];
1088       for (unsigned i = 0; i < 4; i++) {
1089         // We want Const position encoded with the following formula :
1090         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1091         // const_index is Ptr computed by llvm using an alignment of 16.
1092         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1093         // then div by 4 at the ISel step
1094         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1095             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1096         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1097       }
1098       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
1099     } else {
1100       // non constant ptr cant be folded, keeps it as a v4f32 load
1101       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1102           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1103           DAG.getConstant(LoadNode->getAddressSpace() -
1104                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1105           );
1106     }
1107
1108     if (!VT.isVector()) {
1109       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1110           DAG.getConstant(0, MVT::i32));
1111     }
1112
1113     SDValue MergedValues[2] = {
1114         Result,
1115         Chain
1116     };
1117     return DAG.getMergeValues(MergedValues, 2, DL);
1118   }
1119
1120   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1121     return SDValue();
1122   }
1123
1124   // Lowering for indirect addressing
1125   const MachineFunction &MF = DAG.getMachineFunction();
1126   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
1127                                          getTargetMachine().getFrameLowering());
1128   unsigned StackWidth = TFL->getStackWidth(MF);
1129
1130   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1131
1132   if (VT.isVector()) {
1133     unsigned NumElemVT = VT.getVectorNumElements();
1134     EVT ElemVT = VT.getVectorElementType();
1135     SDValue Loads[4];
1136
1137     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1138                                       "vector width in load");
1139
1140     for (unsigned i = 0; i < NumElemVT; ++i) {
1141       unsigned Channel, PtrIncr;
1142       getStackAddress(StackWidth, i, Channel, PtrIncr);
1143       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1144                         DAG.getConstant(PtrIncr, MVT::i32));
1145       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1146                              Chain, Ptr,
1147                              DAG.getTargetConstant(Channel, MVT::i32),
1148                              Op.getOperand(2));
1149     }
1150     for (unsigned i = NumElemVT; i < 4; ++i) {
1151       Loads[i] = DAG.getUNDEF(ElemVT);
1152     }
1153     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1154     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
1155   } else {
1156     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1157                               Chain, Ptr,
1158                               DAG.getTargetConstant(0, MVT::i32), // Channel
1159                               Op.getOperand(2));
1160   }
1161
1162   SDValue Ops[2];
1163   Ops[0] = LoweredLoad;
1164   Ops[1] = Chain;
1165
1166   return DAG.getMergeValues(Ops, 2, DL);
1167 }
1168
1169 /// XXX Only kernel functions are supported, so we can assume for now that
1170 /// every function is a kernel function, but in the future we should use
1171 /// separate calling conventions for kernel and non-kernel functions.
1172 SDValue R600TargetLowering::LowerFormalArguments(
1173                                       SDValue Chain,
1174                                       CallingConv::ID CallConv,
1175                                       bool isVarArg,
1176                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1177                                       SDLoc DL, SelectionDAG &DAG,
1178                                       SmallVectorImpl<SDValue> &InVals) const {
1179   unsigned ParamOffsetBytes = 36;
1180   Function::const_arg_iterator FuncArg =
1181                             DAG.getMachineFunction().getFunction()->arg_begin();
1182   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
1183     EVT VT = Ins[i].VT;
1184     Type *ArgType = FuncArg->getType();
1185     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
1186                              32 : ArgType->getPrimitiveSizeInBits();
1187     unsigned ArgBytes = ArgSizeInBits >> 3;
1188     EVT ArgVT;
1189     if (ArgSizeInBits < VT.getSizeInBits()) {
1190       assert(!ArgType->isFloatTy() &&
1191              "Extending floating point arguments not supported yet");
1192       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
1193     } else {
1194       ArgVT = VT;
1195     }
1196     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1197                                                     AMDGPUAS::PARAM_I_ADDRESS);
1198     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
1199                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
1200                                        MachinePointerInfo(UndefValue::get(PtrTy)),
1201                                        ArgVT, false, false, ArgBytes);
1202     InVals.push_back(Arg);
1203     ParamOffsetBytes += ArgBytes;
1204   }
1205   return Chain;
1206 }
1207
1208 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1209    if (!VT.isVector()) return MVT::i32;
1210    return VT.changeVectorElementTypeToInteger();
1211 }
1212
1213 //===----------------------------------------------------------------------===//
1214 // Custom DAG Optimizations
1215 //===----------------------------------------------------------------------===//
1216
1217 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1218                                               DAGCombinerInfo &DCI) const {
1219   SelectionDAG &DAG = DCI.DAG;
1220
1221   switch (N->getOpcode()) {
1222   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1223   case ISD::FP_ROUND: {
1224       SDValue Arg = N->getOperand(0);
1225       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1226         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1227                            Arg.getOperand(0));
1228       }
1229       break;
1230     }
1231
1232   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1233   // (i32 select_cc f32, f32, -1, 0 cc)
1234   //
1235   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1236   // this to one of the SET*_DX10 instructions.
1237   case ISD::FP_TO_SINT: {
1238     SDValue FNeg = N->getOperand(0);
1239     if (FNeg.getOpcode() != ISD::FNEG) {
1240       return SDValue();
1241     }
1242     SDValue SelectCC = FNeg.getOperand(0);
1243     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1244         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1245         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1246         !isHWTrueValue(SelectCC.getOperand(2)) ||
1247         !isHWFalseValue(SelectCC.getOperand(3))) {
1248       return SDValue();
1249     }
1250
1251     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1252                            SelectCC.getOperand(0), // LHS
1253                            SelectCC.getOperand(1), // RHS
1254                            DAG.getConstant(-1, MVT::i32), // True
1255                            DAG.getConstant(0, MVT::i32),  // Flase
1256                            SelectCC.getOperand(4)); // CC
1257
1258     break;
1259   }
1260   // Extract_vec (Build_vector) generated by custom lowering
1261   // also needs to be customly combined
1262   case ISD::EXTRACT_VECTOR_ELT: {
1263     SDValue Arg = N->getOperand(0);
1264     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1265       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1266         unsigned Element = Const->getZExtValue();
1267         return Arg->getOperand(Element);
1268       }
1269     }
1270     if (Arg.getOpcode() == ISD::BITCAST &&
1271         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1272       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1273         unsigned Element = Const->getZExtValue();
1274         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1275             Arg->getOperand(0).getOperand(Element));
1276       }
1277     }
1278   }
1279
1280   case ISD::SELECT_CC: {
1281     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1282     //      selectcc x, y, a, b, inv(cc)
1283     //
1284     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1285     //      selectcc x, y, a, b, cc
1286     SDValue LHS = N->getOperand(0);
1287     if (LHS.getOpcode() != ISD::SELECT_CC) {
1288       return SDValue();
1289     }
1290
1291     SDValue RHS = N->getOperand(1);
1292     SDValue True = N->getOperand(2);
1293     SDValue False = N->getOperand(3);
1294     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1295
1296     if (LHS.getOperand(2).getNode() != True.getNode() ||
1297         LHS.getOperand(3).getNode() != False.getNode() ||
1298         RHS.getNode() != False.getNode()) {
1299       return SDValue();
1300     }
1301
1302     switch (NCC) {
1303     default: return SDValue();
1304     case ISD::SETNE: return LHS;
1305     case ISD::SETEQ: {
1306       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1307       LHSCC = ISD::getSetCCInverse(LHSCC,
1308                                   LHS.getOperand(0).getValueType().isInteger());
1309       return DAG.getSelectCC(SDLoc(N),
1310                              LHS.getOperand(0),
1311                              LHS.getOperand(1),
1312                              LHS.getOperand(2),
1313                              LHS.getOperand(3),
1314                              LHSCC);
1315     }
1316     }
1317   }
1318   case AMDGPUISD::EXPORT: {
1319     SDValue Arg = N->getOperand(1);
1320     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1321       break;
1322     SDValue NewBldVec[4] = {
1323         DAG.getUNDEF(MVT::f32),
1324         DAG.getUNDEF(MVT::f32),
1325         DAG.getUNDEF(MVT::f32),
1326         DAG.getUNDEF(MVT::f32)
1327       };
1328     SDValue NewArgs[8] = {
1329       N->getOperand(0), // Chain
1330       SDValue(),
1331       N->getOperand(2), // ArrayBase
1332       N->getOperand(3), // Type
1333       N->getOperand(4), // SWZ_X
1334       N->getOperand(5), // SWZ_Y
1335       N->getOperand(6), // SWZ_Z
1336       N->getOperand(7) // SWZ_W
1337     };
1338     for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
1339       if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
1340         if (C->isZero()) {
1341           NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
1342         } else if (C->isExactlyValue(1.0)) {
1343           NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
1344         } else {
1345           NewBldVec[i] = Arg.getOperand(i);
1346         }
1347       } else {
1348         NewBldVec[i] = Arg.getOperand(i);
1349       }
1350     }
1351     SDLoc DL(N);
1352     NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
1353     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1354   }
1355   }
1356   return SDValue();
1357 }