lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/Argument.h"
  20 #include "llvm/Function.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/SelectionDAG.h"
  24
  25 using namespace llvm;
  26
  27 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  28     AMDGPUTargetLowering(TM),
  29     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  30   setOperationAction(ISD::MUL, MVT::i64, Expand);
  31   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  32   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  33   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  34   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  35   computeRegisterProperties();
  36
  37   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  38   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  41
  42   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  43   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  45   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  46   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  47
  48   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  49   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  50
  51   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  52
  53   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  54   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  55   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  56   setOperationAction(ISD::FPOW, MVT::f32, Custom);
  57
  58   setOperationAction(ISD::ROTL, MVT::i32, Custom);
  59
  60   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  61   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  62
  63   setOperationAction(ISD::SETCC, MVT::i32, Custom);
  64   setOperationAction(ISD::SETCC, MVT::f32, Custom);
  65   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  66
  67   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  68   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  69
  70   setOperationAction(ISD::STORE, MVT::i32, Custom);
  71   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  72
  73   setTargetDAGCombine(ISD::FP_ROUND);
  74
  75   setSchedulingPreference(Sched::VLIW);
  76 }
  77
  78 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
  79     MachineInstr * MI, MachineBasicBlock * BB) const {
  80   MachineFunction * MF = BB->getParent();
  81   MachineRegisterInfo &MRI = MF->getRegInfo();
  82   MachineBasicBlock::iterator I = *MI;
  83
  84   switch (MI->getOpcode()) {
  85   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  86   case AMDGPU::SHADER_TYPE: break;
  87   case AMDGPU::CLAMP_R600: {
  88     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
  89                                                    AMDGPU::MOV,
  90                                                    MI->getOperand(0).getReg(),
  91                                                    MI->getOperand(1).getReg());
  92     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
  93     break;
  94   }
  95
  96   case AMDGPU::FABS_R600: {
  97     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
  98                                                     AMDGPU::MOV,
  99                                                     MI->getOperand(0).getReg(),
 100                                                     MI->getOperand(1).getReg());
 101     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 102     break;
 103   }
 104
 105   case AMDGPU::FNEG_R600: {
 106     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 107                                                     AMDGPU::MOV,
 108                                                     MI->getOperand(0).getReg(),
 109                                                     MI->getOperand(1).getReg());
 110     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 111     break;
 112   }
 113
 114   case AMDGPU::R600_LOAD_CONST: {
 115     int64_t RegIndex = MI->getOperand(1).getImm();
 116     unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex);
 117     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY))
 118                 .addOperand(MI->getOperand(0))
 119                 .addReg(ConstantReg);
 120     break;
 121   }
 122
 123   case AMDGPU::MASK_WRITE: {
 124     unsigned maskedRegister = MI->getOperand(0).getReg();
 125     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 126     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 127     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 128     break;
 129   }
 130
 131   case AMDGPU::MOV_IMM_F32:
 132     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 133                      MI->getOperand(1).getFPImm()->getValueAPF()
 134                          .bitcastToAPInt().getZExtValue());
 135     break;
 136   case AMDGPU::MOV_IMM_I32:
 137     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 138                      MI->getOperand(1).getImm());
 139     break;
 140
 141
 142   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 143   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 144     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 145
 146     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 147             .addOperand(MI->getOperand(0))
 148             .addOperand(MI->getOperand(1))
 149             .addImm(EOP); // Set End of program bit
 150     break;
 151   }
 152
 153   case AMDGPU::RESERVE_REG: {
 154     R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
 155     int64_t ReservedIndex = MI->getOperand(0).getImm();
 156     unsigned ReservedReg =
 157                          AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex);
 158     MFI->ReservedRegs.push_back(ReservedReg);
 159     unsigned SuperReg =
 160           AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4);
 161     MFI->ReservedRegs.push_back(SuperReg);
 162     break;
 163   }
 164
 165   case AMDGPU::TXD: {
 166     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 167     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 168
 169     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 170             .addOperand(MI->getOperand(3))
 171             .addOperand(MI->getOperand(4))
 172             .addOperand(MI->getOperand(5))
 173             .addOperand(MI->getOperand(6));
 174     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 175             .addOperand(MI->getOperand(2))
 176             .addOperand(MI->getOperand(4))
 177             .addOperand(MI->getOperand(5))
 178             .addOperand(MI->getOperand(6));
 179     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 180             .addOperand(MI->getOperand(0))
 181             .addOperand(MI->getOperand(1))
 182             .addOperand(MI->getOperand(4))
 183             .addOperand(MI->getOperand(5))
 184             .addOperand(MI->getOperand(6))
 185             .addReg(T0, RegState::Implicit)
 186             .addReg(T1, RegState::Implicit);
 187     break;
 188   }
 189
 190   case AMDGPU::TXD_SHADOW: {
 191     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 192     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 193
 194     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 195             .addOperand(MI->getOperand(3))
 196             .addOperand(MI->getOperand(4))
 197             .addOperand(MI->getOperand(5))
 198             .addOperand(MI->getOperand(6));
 199     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 200             .addOperand(MI->getOperand(2))
 201             .addOperand(MI->getOperand(4))
 202             .addOperand(MI->getOperand(5))
 203             .addOperand(MI->getOperand(6));
 204     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 205             .addOperand(MI->getOperand(0))
 206             .addOperand(MI->getOperand(1))
 207             .addOperand(MI->getOperand(4))
 208             .addOperand(MI->getOperand(5))
 209             .addOperand(MI->getOperand(6))
 210             .addReg(T0, RegState::Implicit)
 211             .addReg(T1, RegState::Implicit);
 212     break;
 213   }
 214
 215   case AMDGPU::BRANCH:
 216       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 217               .addOperand(MI->getOperand(0))
 218               .addReg(0);
 219       break;
 220
 221   case AMDGPU::BRANCH_COND_f32: {
 222     MachineInstr *NewMI =
 223       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 224               AMDGPU::PREDICATE_BIT)
 225               .addOperand(MI->getOperand(1))
 226               .addImm(OPCODE_IS_NOT_ZERO)
 227               .addImm(0); // Flags
 228     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 229     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 230             .addOperand(MI->getOperand(0))
 231             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 232     break;
 233   }
 234
 235   case AMDGPU::BRANCH_COND_i32: {
 236     MachineInstr *NewMI =
 237       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 238             AMDGPU::PREDICATE_BIT)
 239             .addOperand(MI->getOperand(1))
 240             .addImm(OPCODE_IS_NOT_ZERO_INT)
 241             .addImm(0); // Flags
 242     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 243     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 244            .addOperand(MI->getOperand(0))
 245             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 246     break;
 247   }
 248
 249   case AMDGPU::input_perspective: {
 250     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 251
 252     // XXX Be more fine about register reservation
 253     for (unsigned i = 0; i < 4; i ++) {
 254       unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i);
 255       MFI->ReservedRegs.push_back(ReservedReg);
 256     }
 257
 258     switch (MI->getOperand(1).getImm()) {
 259     case 0:// Perspective
 260       MFI->HasPerspectiveInterpolation = true;
 261       break;
 262     case 1:// Linear
 263       MFI->HasLinearInterpolation = true;
 264       break;
 265     default:
 266       assert(0 && "Unknow ij index");
 267     }
 268
 269     return BB;
 270   }
 271
 272   case AMDGPU::EG_ExportSwz:
 273   case AMDGPU::R600_ExportSwz: {
 274     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 275     if (!EOP)
 276       return BB;
 277     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 278     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 279             .addOperand(MI->getOperand(0))
 280             .addOperand(MI->getOperand(1))
 281             .addOperand(MI->getOperand(2))
 282             .addOperand(MI->getOperand(3))
 283             .addOperand(MI->getOperand(4))
 284             .addOperand(MI->getOperand(5))
 285             .addOperand(MI->getOperand(6))
 286             .addImm(CfInst)
 287             .addImm(1);
 288     break;
 289   }
 290   }
 291
 292   MI->eraseFromParent();
 293   return BB;
 294 }
 295
 296 //===----------------------------------------------------------------------===//
 297 // Custom DAG Lowering Operations
 298 //===----------------------------------------------------------------------===//
 299
 300 using namespace llvm::Intrinsic;
 301 using namespace llvm::AMDGPUIntrinsic;
 302
 303 static SDValue
 304 InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
 305     unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
 306     SDValue Scalar, SDValue Chain) {
 307   if (!ExportMap[Slot]) {
 308     SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
 309       DL, MVT::v4f32,
 310       DAG.getUNDEF(MVT::v4f32),
 311       Scalar,
 312       DAG.getConstant(Channel, MVT::i32));
 313
 314     unsigned Mask = 1 << Channel;
 315
 316     const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
 317         DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
 318         DAG.getConstant(Mask, MVT::i32)};
 319
 320     SDValue Res =  DAG.getNode(
 321         AMDGPUISD::EXPORT,
 322         DL,
 323         MVT::Other,
 324         Ops, 6);
 325      ExportMap[Slot] = Res.getNode();
 326      return Res;
 327   }
 328
 329   SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
 330   SDValue PreviousVector = ExportInstruction->getOperand(1);
 331   SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
 332       DL, MVT::v4f32,
 333       PreviousVector,
 334       Scalar,
 335       DAG.getConstant(Channel, MVT::i32));
 336
 337   unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
 338       ->getZExtValue();
 339   Mask |= (1 << Channel);
 340
 341   const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
 342       DAG.getConstant(Inst, MVT::i32),
 343       DAG.getConstant(Type, MVT::i32),
 344       DAG.getConstant(Slot, MVT::i32),
 345       DAG.getConstant(Mask, MVT::i32)};
 346
 347   DAG.UpdateNodeOperands(ExportInstruction,
 348       Ops, 6);
 349
 350   return Chain;
 351
 352 }
 353
 354 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 355   switch (Op.getOpcode()) {
 356   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 357   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
 358   case ISD::ROTL: return LowerROTL(Op, DAG);
 359   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 360   case ISD::SELECT: return LowerSELECT(Op, DAG);
 361   case ISD::SETCC: return LowerSETCC(Op, DAG);
 362   case ISD::STORE: return LowerSTORE(Op, DAG);
 363   case ISD::FPOW: return LowerFPOW(Op, DAG);
 364   case ISD::INTRINSIC_VOID: {
 365     SDValue Chain = Op.getOperand(0);
 366     unsigned IntrinsicID =
 367                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 368     switch (IntrinsicID) {
 369     case AMDGPUIntrinsic::AMDGPU_store_output: {
 370       MachineFunction &MF = DAG.getMachineFunction();
 371       MachineRegisterInfo &MRI = MF.getRegInfo();
 372       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 373       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 374       if (!MRI.isLiveOut(Reg)) {
 375         MRI.addLiveOut(Reg);
 376       }
 377       return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
 378     }
 379     case AMDGPUIntrinsic::R600_store_pixel_color: {
 380       MachineFunction &MF = DAG.getMachineFunction();
 381       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 382       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 383
 384       SDNode **OutputsMap = MFI->Outputs;
 385       return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
 386           RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
 387           Chain);
 388
 389     }
 390     case AMDGPUIntrinsic::R600_store_stream_output : {
 391       MachineFunction &MF = DAG.getMachineFunction();
 392       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 393       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 394       int64_t BufIndex = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
 395
 396       SDNode **OutputsMap = MFI->StreamOutputs[BufIndex];
 397       unsigned Inst;
 398       switch (cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue()  ) {
 399       // STREAM3
 400       case 3:
 401         Inst = 4;
 402         break;
 403       // STREAM2
 404       case 2:
 405         Inst = 3;
 406         break;
 407       // STREAM1
 408       case 1:
 409         Inst = 2;
 410         break;
 411       // STREAM0
 412       case 0:
 413         Inst = 1;
 414         break;
 415       default:
 416         llvm_unreachable("Wrong buffer id for stream outputs !");
 417       }
 418
 419       return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
 420           RegIndex / 4, RegIndex % 4, Inst, 0, Op.getOperand(2),
 421           Chain);
 422     }
 423     // default for switch(IntrinsicID)
 424     default: break;
 425     }
 426     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 427     break;
 428   }
 429   case ISD::INTRINSIC_WO_CHAIN: {
 430     unsigned IntrinsicID =
 431                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 432     EVT VT = Op.getValueType();
 433     DebugLoc DL = Op.getDebugLoc();
 434     switch(IntrinsicID) {
 435     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 436     case AMDGPUIntrinsic::R600_load_input: {
 437       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 438       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 439       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
 440     }
 441     case AMDGPUIntrinsic::R600_load_input_perspective: {
 442       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 443       if (slot < 0)
 444         return DAG.getUNDEF(MVT::f32);
 445       SDValue FullVector = DAG.getNode(
 446           AMDGPUISD::INTERP,
 447           DL, MVT::v4f32,
 448           DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
 449       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
 450         DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
 451     }
 452     case AMDGPUIntrinsic::R600_load_input_linear: {
 453       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 454       if (slot < 0)
 455         return DAG.getUNDEF(MVT::f32);
 456       SDValue FullVector = DAG.getNode(
 457         AMDGPUISD::INTERP,
 458         DL, MVT::v4f32,
 459         DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
 460       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
 461         DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
 462     }
 463     case AMDGPUIntrinsic::R600_load_input_constant: {
 464       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 465       if (slot < 0)
 466         return DAG.getUNDEF(MVT::f32);
 467       SDValue FullVector = DAG.getNode(
 468         AMDGPUISD::INTERP_P0,
 469         DL, MVT::v4f32,
 470         DAG.getConstant(slot / 4 , MVT::i32));
 471       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
 472           DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
 473     }
 474
 475     case r600_read_ngroups_x:
 476       return LowerImplicitParameter(DAG, VT, DL, 0);
 477     case r600_read_ngroups_y:
 478       return LowerImplicitParameter(DAG, VT, DL, 1);
 479     case r600_read_ngroups_z:
 480       return LowerImplicitParameter(DAG, VT, DL, 2);
 481     case r600_read_global_size_x:
 482       return LowerImplicitParameter(DAG, VT, DL, 3);
 483     case r600_read_global_size_y:
 484       return LowerImplicitParameter(DAG, VT, DL, 4);
 485     case r600_read_global_size_z:
 486       return LowerImplicitParameter(DAG, VT, DL, 5);
 487     case r600_read_local_size_x:
 488       return LowerImplicitParameter(DAG, VT, DL, 6);
 489     case r600_read_local_size_y:
 490       return LowerImplicitParameter(DAG, VT, DL, 7);
 491     case r600_read_local_size_z:
 492       return LowerImplicitParameter(DAG, VT, DL, 8);
 493
 494     case r600_read_tgid_x:
 495       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 496                                   AMDGPU::T1_X, VT);
 497     case r600_read_tgid_y:
 498       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 499                                   AMDGPU::T1_Y, VT);
 500     case r600_read_tgid_z:
 501       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 502                                   AMDGPU::T1_Z, VT);
 503     case r600_read_tidig_x:
 504       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 505                                   AMDGPU::T0_X, VT);
 506     case r600_read_tidig_y:
 507       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 508                                   AMDGPU::T0_Y, VT);
 509     case r600_read_tidig_z:
 510       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 511                                   AMDGPU::T0_Z, VT);
 512     }
 513     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 514     break;
 515   }
 516   } // end switch(Op.getOpcode())
 517   return SDValue();
 518 }
 519
 520 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 521                                             SmallVectorImpl<SDValue> &Results,
 522                                             SelectionDAG &DAG) const {
 523   switch (N->getOpcode()) {
 524   default: return;
 525   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 526   }
 527 }
 528
 529 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 530   return DAG.getNode(
 531       ISD::SETCC,
 532       Op.getDebugLoc(),
 533       MVT::i1,
 534       Op, DAG.getConstantFP(0.0f, MVT::f32),
 535       DAG.getCondCode(ISD::SETNE)
 536       );
 537 }
 538
 539 SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 540   SDValue Chain = Op.getOperand(0);
 541   SDValue CC = Op.getOperand(1);
 542   SDValue LHS   = Op.getOperand(2);
 543   SDValue RHS   = Op.getOperand(3);
 544   SDValue JumpT  = Op.getOperand(4);
 545   SDValue CmpValue;
 546   SDValue Result;
 547
 548   if (LHS.getValueType() == MVT::i32) {
 549     CmpValue = DAG.getNode(
 550         ISD::SELECT_CC,
 551         Op.getDebugLoc(),
 552         MVT::i32,
 553         LHS, RHS,
 554         DAG.getConstant(-1, MVT::i32),
 555         DAG.getConstant(0, MVT::i32),
 556         CC);
 557   } else if (LHS.getValueType() == MVT::f32) {
 558     CmpValue = DAG.getNode(
 559         ISD::SELECT_CC,
 560         Op.getDebugLoc(),
 561         MVT::f32,
 562         LHS, RHS,
 563         DAG.getConstantFP(1.0f, MVT::f32),
 564         DAG.getConstantFP(0.0f, MVT::f32),
 565         CC);
 566   } else {
 567     assert(0 && "Not valid type for br_cc");
 568   }
 569   Result = DAG.getNode(
 570       AMDGPUISD::BRANCH_COND,
 571       CmpValue.getDebugLoc(),
 572       MVT::Other, Chain,
 573       JumpT, CmpValue);
 574   return Result;
 575 }
 576
 577 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 578                                                    DebugLoc DL,
 579                                                    unsigned DwordOffset) const {
 580   unsigned ByteOffset = DwordOffset * 4;
 581   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 582                                       AMDGPUAS::PARAM_I_ADDRESS);
 583
 584   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 585   assert(isInt<16>(ByteOffset));
 586
 587   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 588                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 589                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 590                      false, false, false, 0);
 591 }
 592
 593 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
 594   DebugLoc DL = Op.getDebugLoc();
 595   EVT VT = Op.getValueType();
 596
 597   return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
 598                      Op.getOperand(0),
 599                      Op.getOperand(0),
 600                      DAG.getNode(ISD::SUB, DL, VT,
 601                                  DAG.getConstant(32, MVT::i32),
 602                                  Op.getOperand(1)));
 603 }
 604
 605 bool R600TargetLowering::isZero(SDValue Op) const {
 606   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 607     return Cst->isNullValue();
 608   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 609     return CstFP->isZero();
 610   } else {
 611     return false;
 612   }
 613 }
 614
 615 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 616   DebugLoc DL = Op.getDebugLoc();
 617   EVT VT = Op.getValueType();
 618
 619   SDValue LHS = Op.getOperand(0);
 620   SDValue RHS = Op.getOperand(1);
 621   SDValue True = Op.getOperand(2);
 622   SDValue False = Op.getOperand(3);
 623   SDValue CC = Op.getOperand(4);
 624   SDValue Temp;
 625
 626   // LHS and RHS are guaranteed to be the same value type
 627   EVT CompareVT = LHS.getValueType();
 628
 629   // Check if we can lower this to a native operation.
 630
 631   // Try to lower to a CND* instruction:
 632   // CND* instructions requires RHS to be zero.  Some SELECT_CC nodes that
 633   // can be lowered to CND* instructions can also be lowered to SET*
 634   // instructions.  CND* instructions are cheaper, because they dont't
 635   // require additional instructions to convert their result to the correct
 636   // value type, so this check should be first.
 637   if (isZero(LHS) || isZero(RHS)) {
 638     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 639     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 640     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 641     if (CompareVT != VT) {
 642       // Bitcast True / False to the correct types.  This will end up being
 643       // a nop, but it allows us to define only a single pattern in the
 644       // .TD files for each CND* instruction rather than having to have
 645       // one pattern for integer True/False and one for fp True/False
 646       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 647       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 648     }
 649     if (isZero(LHS)) {
 650       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 651     }
 652
 653     switch (CCOpcode) {
 654     case ISD::SETONE:
 655     case ISD::SETUNE:
 656     case ISD::SETNE:
 657     case ISD::SETULE:
 658     case ISD::SETULT:
 659     case ISD::SETOLE:
 660     case ISD::SETOLT:
 661     case ISD::SETLE:
 662     case ISD::SETLT:
 663       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 664       Temp = True;
 665       True = False;
 666       False = Temp;
 667       break;
 668     default:
 669       break;
 670     }
 671     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 672         Cond, Zero,
 673         True, False,
 674         DAG.getCondCode(CCOpcode));
 675     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 676   }
 677
 678   // Try to lower to a SET* instruction:
 679   // We need all the operands of SELECT_CC to have the same value type, so if
 680   // necessary we need to change True and False to be the same type as LHS and
 681   // RHS, and then convert the result of the select_cc back to the correct type.
 682
 683   // Move hardware True/False values to the correct operand.
 684   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 685     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 686     std::swap(False, True);
 687     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 688   }
 689
 690   if (isHWTrueValue(True) && isHWFalseValue(False)) {
 691     if (CompareVT !=  VT) {
 692       if (VT == MVT::f32 && CompareVT == MVT::i32) {
 693         SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 694             LHS, RHS,
 695             DAG.getConstant(-1, MVT::i32),
 696             DAG.getConstant(0, MVT::i32),
 697             CC);
 698         // Convert integer values of true (-1) and false (0) to fp values of
 699         // true (1.0f) and false (0.0f).
 700         SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
 701                                                   DAG.getConstant(1, MVT::i32));
 702         return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
 703       } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
 704         SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 705             LHS, RHS,
 706             DAG.getConstantFP(1.0f, MVT::f32),
 707             DAG.getConstantFP(0.0f, MVT::f32),
 708             CC);
 709         // Convert fp values of true (1.0f) and false (0.0f) to integer values
 710         // of true (-1) and false (0).
 711         SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
 712         return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
 713       } else {
 714         // I don't think there will be any other type pairings.
 715         assert(!"Unhandled operand type parings in SELECT_CC");
 716       }
 717     } else {
 718       // This SELECT_CC is already legal.
 719       return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 720     }
 721   }
 722
 723   // Possible Min/Max pattern
 724   SDValue MinMax = LowerMinMax(Op, DAG);
 725   if (MinMax.getNode()) {
 726     return MinMax;
 727   }
 728
 729   // If we make it this for it means we have no native instructions to handle
 730   // this SELECT_CC, so we must lower it.
 731   SDValue HWTrue, HWFalse;
 732
 733   if (CompareVT == MVT::f32) {
 734     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 735     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 736   } else if (CompareVT == MVT::i32) {
 737     HWTrue = DAG.getConstant(-1, CompareVT);
 738     HWFalse = DAG.getConstant(0, CompareVT);
 739   }
 740   else {
 741     assert(!"Unhandled value type in LowerSELECT_CC");
 742   }
 743
 744   // Lower this unsupported SELECT_CC into a combination of two supported
 745   // SELECT_CC operations.
 746   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 747
 748   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 749       Cond, HWFalse,
 750       True, False,
 751       DAG.getCondCode(ISD::SETNE));
 752 }
 753
 754 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 755   return DAG.getNode(ISD::SELECT_CC,
 756       Op.getDebugLoc(),
 757       Op.getValueType(),
 758       Op.getOperand(0),
 759       DAG.getConstant(0, MVT::i32),
 760       Op.getOperand(1),
 761       Op.getOperand(2),
 762       DAG.getCondCode(ISD::SETNE));
 763 }
 764
 765 SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 766   SDValue Cond;
 767   SDValue LHS = Op.getOperand(0);
 768   SDValue RHS = Op.getOperand(1);
 769   SDValue CC  = Op.getOperand(2);
 770   DebugLoc DL = Op.getDebugLoc();
 771   assert(Op.getValueType() == MVT::i32);
 772   if (LHS.getValueType() == MVT::i32) {
 773     Cond = DAG.getNode(
 774         ISD::SELECT_CC,
 775         Op.getDebugLoc(),
 776         MVT::i32,
 777         LHS, RHS,
 778         DAG.getConstant(-1, MVT::i32),
 779         DAG.getConstant(0, MVT::i32),
 780         CC);
 781   } else if (LHS.getValueType() == MVT::f32) {
 782     Cond = DAG.getNode(
 783         ISD::SELECT_CC,
 784         Op.getDebugLoc(),
 785         MVT::f32,
 786         LHS, RHS,
 787         DAG.getConstantFP(1.0f, MVT::f32),
 788         DAG.getConstantFP(0.0f, MVT::f32),
 789         CC);
 790     Cond = DAG.getNode(
 791         ISD::FP_TO_SINT,
 792         DL,
 793         MVT::i32,
 794         Cond);
 795   } else {
 796     assert(0 && "Not valid type for set_cc");
 797   }
 798   Cond = DAG.getNode(
 799       ISD::AND,
 800       DL,
 801       MVT::i32,
 802       DAG.getConstant(1, MVT::i32),
 803       Cond);
 804   return Cond;
 805 }
 806
 807 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 808   DebugLoc DL = Op.getDebugLoc();
 809   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 810   SDValue Chain = Op.getOperand(0);
 811   SDValue Value = Op.getOperand(1);
 812   SDValue Ptr = Op.getOperand(2);
 813
 814   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 815       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 816     // Convert pointer from byte address to dword address.
 817     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 818                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 819                                   Ptr, DAG.getConstant(2, MVT::i32)));
 820
 821     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 822       assert(!"Truncated and indexed stores not supported yet");
 823     } else {
 824       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 825     }
 826     return Chain;
 827   }
 828   return SDValue();
 829 }
 830
 831
 832 SDValue R600TargetLowering::LowerFPOW(SDValue Op,
 833     SelectionDAG &DAG) const {
 834   DebugLoc DL = Op.getDebugLoc();
 835   EVT VT = Op.getValueType();
 836   SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
 837   SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
 838   return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
 839 }
 840
 841 /// XXX Only kernel functions are supported, so we can assume for now that
 842 /// every function is a kernel function, but in the future we should use
 843 /// separate calling conventions for kernel and non-kernel functions.
 844 SDValue R600TargetLowering::LowerFormalArguments(
 845                                       SDValue Chain,
 846                                       CallingConv::ID CallConv,
 847                                       bool isVarArg,
 848                                       const SmallVectorImpl<ISD::InputArg> &Ins,
 849                                       DebugLoc DL, SelectionDAG &DAG,
 850                                       SmallVectorImpl<SDValue> &InVals) const {
 851   unsigned ParamOffsetBytes = 36;
 852   Function::const_arg_iterator FuncArg =
 853                             DAG.getMachineFunction().getFunction()->arg_begin();
 854   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
 855     EVT VT = Ins[i].VT;
 856     Type *ArgType = FuncArg->getType();
 857     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
 858                              32 : ArgType->getPrimitiveSizeInBits();
 859     unsigned ArgBytes = ArgSizeInBits >> 3;
 860     EVT ArgVT;
 861     if (ArgSizeInBits < VT.getSizeInBits()) {
 862       assert(!ArgType->isFloatTy() &&
 863              "Extending floating point arguments not supported yet");
 864       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
 865     } else {
 866       ArgVT = VT;
 867     }
 868     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 869                                                     AMDGPUAS::PARAM_I_ADDRESS);
 870     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
 871                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
 872                                        MachinePointerInfo(new Argument(PtrTy)),
 873                                        ArgVT, false, false, ArgBytes);
 874     InVals.push_back(Arg);
 875     ParamOffsetBytes += ArgBytes;
 876   }
 877   return Chain;
 878 }
 879
 880 EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
 881    if (!VT.isVector()) return MVT::i32;
 882    return VT.changeVectorElementTypeToInteger();
 883 }
 884
 885 //===----------------------------------------------------------------------===//
 886 // Custom DAG Optimizations
 887 //===----------------------------------------------------------------------===//
 888
 889 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
 890                                               DAGCombinerInfo &DCI) const {
 891   SelectionDAG &DAG = DCI.DAG;
 892
 893   switch (N->getOpcode()) {
 894   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
 895   case ISD::FP_ROUND: {
 896       SDValue Arg = N->getOperand(0);
 897       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
 898         return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
 899                            Arg.getOperand(0));
 900       }
 901       break;
 902     }
 903   }
 904   return SDValue();
 905 }