//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;
R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD, MVT::v4i32, Expand);
  setOperationAction(ISD::AND, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::MUL, MVT::v2i32, Expand);
  setOperationAction(ISD::MUL, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Expand);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Expand);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}
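
// Expand pseudo instructions at the MachineInstr level: fold the
// clamp/abs/neg pseudos into flag bits on a MOV, materialize immediate
// moves, rewrite the TXD pseudos into real texture instructions, and give
// branches, exports and RETURN their final shape.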
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    // Adjust the coordinate swizzle and the coordinate-type flags for the
    // texture types that need it.
    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    // Same coordinate adjustments as the TXD case above.
    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX).addImm(SrcY).addImm(SrcZ).addImm(SrcW)
            .addImm(0).addImm(0).addImm(0).addImm(0)
            .addImm(1).addImm(2).addImm(3)
            .addOperand(RID).addOperand(SID)
            .addImm(CTX).addImm(CTY).addImm(CTZ).addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }
    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
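
// Frame objects live in private (scratch) memory, which is addressed
// indirectly, so a frame index lowers to a constant register-file offset
// scaled by 4 times the stack width.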
SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.
  //
  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1, 0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0,   f32, f32, cc_any
  // select_cc i32, 0,   i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE: case ISD::SETUNE: case ISD::SETNE:
    case ISD::SETULE: case ISD::SETULT:
    case ISD::SETOLE: case ISD::SETOLT:
    case ISD::SETLE: case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      std::swap(True, False);
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                             LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
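
// SELECT is lowered as a SELECT_CC that tests the condition against zero
// with SETNE; booleans are ZeroOrNegativeOne here, so this is exact.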
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      SDLoc(Op),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
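
// For example, with StackWidth == 2 every register slot covers two dwords
// (8 bytes), so byte address 24 becomes register index 24 >> 3 == 3.
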
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    PtrIncr = ElemIdx > 0 ? 1 : 0;
    break;
  case 2:
    Channel = ElemIdx % 2;
    PtrIncr = ElemIdx == 2 ? 1 : 0;
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));
      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
                        Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns the constant-buffer base position for \p AddressSpace:
// 512 + (kc_bank << 12).
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
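
// Lower loads from the constant address spaces to CONST_ADDRESS nodes
// (folded to an absolute constant-buffer position when the pointer is
// constant) and loads from private memory to REGISTER_LOAD; other address
// spaces are left for the generic legalizer.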
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr, computed by llvm using an alignment of 16.
        // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
        // then divide by 4 at the ISel step.
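        // For example, kc_bank 0, Ptr = 16 (const_index 1) and chan 2 give
        // 16 + 4 * 2 + 512 * 16 = 8216 here, and 8216 / 4 == 2054 ==
        // ((512 + 1) << 2) + 2 at ISel.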
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant pointer can't be folded; keep the whole v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
        Result,
        Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  // The first 36 bytes of the parameter buffer hold the nine implicit dword
  // parameters read by LowerImplicitParameter; explicit kernel arguments
  // start right after them.
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
      DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
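
// Fold build_vector lanes that the TEX/export swizzle can encode directly:
// constant 0.0 and 1.0 lanes become the SEL_0/SEL_1 selectors, and a lane
// that repeats an earlier one is remapped onto the first occurrence and
// marked undef. RemapSwizzle records the old -> new lane mapping.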
static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
      VectorEntry.getOperand(0),
      VectorEntry.getOperand(1),
      VectorEntry.getOperand(2),
      VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}
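
// Move each lane that extracts element Idx of some vector into lane Idx,
// recording the resulting permutation in RemapSwizzle, so that the final
// swizzle operands can express the reordering without extra moves.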
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
      VectorEntry.getOperand(0),
      VectorEntry.getOperand(1),
      VectorEntry.getOperand(2),
      VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (!isUnmovable[Idx]) {
        // Swap i and Idx
        std::swap(NewBldVec[Idx], NewBldVec[i]);
        RemapSwizzle[Idx] = i;
        RemapSwizzle[i] = Idx;
      }
      isUnmovable[Idx] = true;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
      VectorEntry.getValueType(), NewBldVec, 4);
}
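
// Rewrite a build_vector that feeds an export or texture fetch, updating
// the four swizzle operands Swz in place: first compact constant and
// duplicated lanes, then reorder the remaining lanes.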
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }
  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(SDLoc(N),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }
  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7) // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),  N->getOperand(1),  N->getOperand(2),
      N->getOperand(3),  N->getOperand(4),  N->getOperand(5),
      N->getOperand(6),  N->getOperand(7),  N->getOperand(8),
      N->getOperand(9),  N->getOperand(10), N->getOperand(11),
      N->getOperand(12), N->getOperand(13), N->getOperand(14),
      N->getOperand(15), N->getOperand(16), N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),