//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//
#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD, MVT::v4i32, Expand);
  setOperationAction(ISD::AND, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::MUL, MVT::v2i32, Expand);
  setOperationAction(ISD::MUL, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
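
  // Booleans are 0 / -1: the hardware SET* instructions produce all-ones for
  // true and zero for false (see the select_cc patterns in LowerSELECT_CC).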
  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
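    // If the next instruction is a RETURN, this store is the last thing the
    // program does; fold the end-of-program bit into it so no separate CF
    // instruction is needed.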
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

using namespace llvm::Intrinsic;
using namespace llvm::AMDGPUIntrinsic;

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      return SDValue(interp, slot % 2);
    }

    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
  }
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}
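
// The first nine dwords of the implicit parameter buffer (PARAM_I_ADDRESS)
// hold, in order, ngroups.{x,y,z}, global_size.{x,y,z} and local_size.{x,y,z}
// (dword offsets 0-8, as used by the r600_read_* intrinsics above). Explicit
// kernel arguments follow at byte 36 (see LowerFormalArguments).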
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   DebugLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}
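
// A frame-index offset counts stack slots of StackWidth dwords each, so it is
// scaled by 4 * StackWidth to form a byte address; stackPtrToRegIndex() later
// divides by the same factor to recover a register index.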
SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                      getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}
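
// ROTL is lowered with BITALIGN, which extracts a 32-bit window from the
// 64-bit concatenation of its first two operands, shifted right by the third;
// with both sources equal to x, bitalign(x, x, 32 - n) == rotl(x, n).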
SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
                     Op.getOperand(0),
                     Op.getOperand(0),
                     DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(32, MVT::i32),
                                 Op.getOperand(1)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1,  0, cc_any

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0,   f32, f32, cc_any
  // select_cc i32, 0,   i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      // Keep the zero on the right-hand side by swapping the condition.
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      // CND* only tests ==, >= and > against zero, so rewrite the remaining
      // conditions as their inverses and swap True/False.
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      std::swap(True, False);
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}
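
// SELECT is lowered as a SELECT_CC comparing the condition against zero:
// select cond, t, f  ->  select_cc cond, 0, t, f, setne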
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
                     Op.getDebugLoc(),
                     Op.getValueType(),
                     Op.getOperand(0),
                     DAG.getConstant(0, MVT::i32),
                     Op.getOperand(1),
                     Op.getOperand(2),
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
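///
/// For example, with a StackWidth of 2 each register packs two dwords, so a
/// byte pointer is shifted right by 3 (divided by 8) to form a register
/// index; with a StackWidth of 4 it is shifted right by 4.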
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}
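
// For element ElemIdx of a value spilled with the given StackWidth, compute
// the channel it occupies and the increment to apply to the running register
// pointer first. PtrIncr is relative: callers add it to the pointer before
// each element, so with StackWidth 2 a four-element vector puts elements 0/1
// in channels 0/1 of one register and elements 2/3 in the next register.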
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    PtrIncr = (ElemIdx > 0) ? 1 : 0;
    break;
  case 2:
    Channel = ElemIdx % 2;
    PtrIncr = (ElemIdx == 2) ? 1 : 0;
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}
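
// Each constant-buffer address space maps onto an address block of
// (512 + (kc_bank << 12)); e.g. CONSTANT_BUFFER_2 starts at
// 512 + (2 << 12) = 8704.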
// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr, computed by llvm using an alignment of 16.
        // Thus we add (ConstantBlock * 16 + i * 4) to the byte address here
        // and divide the result by 4 at the ISel step.
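        // E.g. for kc_bank 0, Ptr = 32 (const_index 2) and chan 1:
        // NewPtr = 32 + 4 * 1 + 512 * 16 = 8228, and 8228 / 4 = 2057
        //        = ((512 + 2) << 2) + 1, matching the formula above.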
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
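  // The nine implicit parameter dwords (dword offsets 0-8, see
  // LowerImplicitParameter) occupy bytes 0-35 of the parameter buffer, so
  // explicit kernel arguments start at byte offset 36.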
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
                            DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32), // False
                       SelectCC.getOperand(4)); // CC
  }

  // An extract_vector_elt of a build_vector generated by our custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //   selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //   selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(N->getDebugLoc(),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;
    SDValue NewBldVec[4] = {
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32),
      DAG.getUNDEF(MVT::f32)
    };
    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),        // Export Value, filled in below
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
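    // Components that are constant 0.0 or 1.0 don't need a register channel:
    // the export swizzle can produce them directly with the SEL_0 (4) and
    // SEL_1 (5) selectors.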
    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
        if (C->isZero()) {
          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
        } else if (C->isExactlyValue(1.0)) {
          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
        } else {
          NewBldVec[i] = Arg.getOperand(i);
        }
      } else {
        NewBldVec[i] = Arg.getOperand(i);
      }
    }
    DebugLoc DL = N->getDebugLoc();
    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  }

  return SDValue();
}