1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Custom DAG lowering for R600
13 //===----------------------------------------------------------------------===//
15 #include "R600ISelLowering.h"
16 #include "R600Defines.h"
17 #include "R600InstrInfo.h"
18 #include "R600MachineFunctionInfo.h"
19 #include "llvm/Argument.h"
20 #include "llvm/Function.h"
21 #include "llvm/CodeGen/MachineInstrBuilder.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/SelectionDAG.h"
27 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
28 AMDGPUTargetLowering(TM),
29 TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
30 setOperationAction(ISD::MUL, MVT::i64, Expand);
31 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
32 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
33 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
34 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
35 computeRegisterProperties();
37 setOperationAction(ISD::FADD, MVT::v4f32, Expand);
38 setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
39 setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
40 setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
42 setOperationAction(ISD::ADD, MVT::v4i32, Expand);
43 setOperationAction(ISD::AND, MVT::v4i32, Expand);
44 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
45 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
46 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
47 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
48 setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
49 setOperationAction(ISD::UREM, MVT::v4i32, Expand);
50 setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
52 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
53 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
55 setOperationAction(ISD::FSUB, MVT::f32, Expand);
57 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
58 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
59 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
60 setOperationAction(ISD::FPOW, MVT::f32, Custom);
62 setOperationAction(ISD::ROTL, MVT::i32, Custom);
64 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
65 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
67 setOperationAction(ISD::SETCC, MVT::i32, Custom);
68 setOperationAction(ISD::SETCC, MVT::f32, Custom);
69 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
71 setOperationAction(ISD::SELECT, MVT::i32, Custom);
72 setOperationAction(ISD::SELECT, MVT::f32, Custom);
74 setOperationAction(ISD::STORE, MVT::i32, Custom);
75 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
77 setTargetDAGCombine(ISD::FP_ROUND);
79 setSchedulingPreference(Sched::VLIW);
// Expand R600 pseudo instructions into real machine instructions after
// instruction selection: modifier pseudos (CLAMP/FABS/FNEG become copies
// tagged with an instruction flag), immediate materialization, constant
// register loads, register reservation, texture sampling with derivatives
// (TXD variants), conditional branches and exports.
//
// NOTE(review): several case bodies below appear truncated relative to a
// well-formed switch (missing `break` statements, closing braces, and at
// least one case label) — confirm against the canonical upstream file
// before relying on control flow here.
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // Replacement instructions are inserted at the position of the pseudo.
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  // Anything not handled here uses the generic AMDGPU expansion.
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::SHADER_TYPE: break;
  case AMDGPU::CLAMP_R600: {
    // Re-emit as a default (copy-like) instruction carrying the CLAMP flag.
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
  case AMDGPU::FABS_R600: {
    // Re-emit carrying the ABS source modifier flag.
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
  case AMDGPU::FNEG_R600: {
    // Re-emit carrying the NEG source modifier flag.
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
  case AMDGPU::R600_LOAD_CONST: {
    // Copy out of the constant register file; the index is an immediate.
    int64_t RegIndex = MI->getOperand(1).getImm();
    unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY))
            .addOperand(MI->getOperand(0))
            .addReg(ConstantReg);
  case AMDGPU::MASK_WRITE: {
    // Tag the defining instruction of the masked register so its result
    // write is suppressed.
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
  case AMDGPU::MOV_IMM_F32:
    // Materialize an f32 immediate via its raw integer bit pattern.
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    // Fold the end-of-program bit into the RAT write when the next
    // instruction is a return.
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
  case AMDGPU::RESERVE_REG: {
    // Reserve the 32-bit register (and, apparently, its 128-bit
    // super-register) so the allocator will not reuse them.
    R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
    int64_t ReservedIndex = MI->getOperand(0).getImm();
    unsigned ReservedReg =
                         AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex);
    MFI->ReservedRegs.push_back(ReservedReg);
    // NOTE(review): the `unsigned SuperReg =` declaration for the next
    // statement looks to be missing — `SuperReg` is otherwise undefined.
    AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4);
    MFI->ReservedRegs.push_back(SuperReg);
  // NOTE(review): a `case AMDGPU::TXD:` label appears to be missing here —
  // the block below emits gradient setup (H/V) then TEX_SAMPLE_G, which
  // matches texture-sample-with-derivatives lowering.
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    // The sample reads the gradient registers implicitly.
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
  case AMDGPU::TXD_SHADOW: {
    // Same as the TXD expansion above, but with the shadow-compare sampler.
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
  // NOTE(review): an unconditional-branch case label (presumably
  // `case AMDGPU::BRANCH:`) appears to be missing before this JUMP.
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
  case AMDGPU::BRANCH_COND_f32: {
    // Emit a PRED_X comparing the f32 condition against zero, then a JUMP
    // predicated on (and killing) the predicate bit.
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
  case AMDGPU::BRANCH_COND_i32: {
    // Integer variant of the conditional branch above.
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
  case AMDGPU::input_perspective: {
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();

    // XXX Be more fine about register reservation
    for (unsigned i = 0; i < 4; i ++) {
      unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i);
      MFI->ReservedRegs.push_back(ReservedReg);
    // Operand 1 selects the interpolation mode (ij index).
    switch (MI->getOperand(1).getImm()) {
    case 0:// Perspective
      MFI->HasPerspectiveInterpolation = true;
    // NOTE(review): the `case 1:// Linear` label and `default:` appear to
    // be missing around the next two statements.
    MFI->HasLinearInterpolation = true;
      assert(0 && "Unknow ij index");
  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // End-of-program bit is set when the export is immediately followed by
    // a return.
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
    // Control-flow instruction id differs between Evergreen (84) and
    // R600 (40) export encodings.
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
  // The pseudo itself is always removed after expansion.
  MI->eraseFromParent();
300 //===----------------------------------------------------------------------===//
301 // Custom DAG Lowering Operations
302 //===----------------------------------------------------------------------===//
304 using namespace llvm::Intrinsic;
305 using namespace llvm::AMDGPUIntrinsic;
// Accumulate one scalar channel into the per-slot export node cached in
// ExportMap.  The first scalar for a slot creates a fresh export node from
// an UNDEF v4f32 vector; subsequent scalars insert their channel into the
// existing export's vector and widen its write mask in place via
// UpdateNodeOperands.
//
// NOTE(review): this function appears truncated — its return-type line and
// several DAG.getNode(...) argument lists (node opcodes, DL, result types,
// element operands) are missing; confirm against the upstream file.
InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
    unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
    SDValue Scalar, SDValue Chain) {
  if (!ExportMap[Slot]) {
    // First write to this slot: build the vector from scratch.
    SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
        DAG.getUNDEF(MVT::v4f32),
        DAG.getConstant(Channel, MVT::i32));
    // Write mask covers only this channel so far.
    unsigned Mask = 1 << Channel;

    const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
        DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
        DAG.getConstant(Mask, MVT::i32)};

    SDValue Res = DAG.getNode(
    // Remember the export so later channels for this slot can merge in.
    ExportMap[Slot] = Res.getNode();

  // Slot already has an export: splice this channel into its vector.
  SDNode *ExportInstruction = (SDNode *) ExportMap[Slot] ;
  SDValue PreviousVector = ExportInstruction->getOperand(1);
  SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
      DAG.getConstant(Channel, MVT::i32));

  // Extend the existing write mask (operand 5) with this channel.
  unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
  Mask |= (1 << Channel);

  const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
      DAG.getConstant(Inst, MVT::i32),
      DAG.getConstant(Type, MVT::i32),
      DAG.getConstant(Slot, MVT::i32),
      DAG.getConstant(Mask, MVT::i32)};

  // Mutate the cached export node in place rather than creating a new one.
  DAG.UpdateNodeOperands(ExportInstruction,
// Central dispatch for all operations marked Custom in the constructor.
// Simple opcodes forward to their dedicated Lower* helper; the intrinsic
// cases are handled inline: INTRINSIC_VOID covers the store-style
// intrinsics (output/pixel/stream exports) and INTRINSIC_WO_CHAIN covers
// the load-style ones (shader inputs, implicit kernel parameters, and
// workgroup/thread id registers).
//
// NOTE(review): several statements below appear truncated (missing closing
// braces, DAG.getNode argument lists, and the `Inst` selection for stream
// outputs) — confirm against the upstream file.
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::SETCC: return LowerSETCC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::FPOW: return LowerFPOW(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      // Copy the value into the fixed output register for this index.
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      // Output registers must be live-out of the function.
      if (!MRI.isLiveOut(Reg)) {
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    case AMDGPUIntrinsic::R600_store_pixel_color: {
      // Merge this scalar channel into the per-slot color export node.
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();

      SDNode **OutputsMap = MFI->Outputs;
      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
    case AMDGPUIntrinsic::R600_store_stream_output : {
      // Same merging scheme, but into the per-buffer stream-output maps.
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      int64_t BufIndex = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

      SDNode **OutputsMap = MFI->StreamOutputs[BufIndex];
      // NOTE(review): the cases selecting the export instruction id per
      // buffer appear to be missing from this switch.
      switch (cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue() ) {
        llvm_unreachable("Wrong buffer id for stream outputs !");

      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
          RegIndex / 4, RegIndex % 4, Inst, 0, Op.getOperand(2),
    // default for switch(IntrinsicID)
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      // Shader inputs live in fixed T registers; expose them as live-ins.
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    case AMDGPUIntrinsic::R600_load_input_perspective: {
      // Interpolate a perspective-corrected input: build the full 4-wide
      // interpolated vector for the slot, then extract this channel.
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
        return DAG.getUNDEF(MVT::f32);
      SDValue FullVector = DAG.getNode(
          DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
          DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
    case AMDGPUIntrinsic::R600_load_input_linear: {
      // Linear interpolation variant (mode selector 1).
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
        return DAG.getUNDEF(MVT::f32);
      SDValue FullVector = DAG.getNode(
          DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4 , MVT::i32));
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
          DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
    case AMDGPUIntrinsic::R600_load_input_constant: {
      // Constant (flat) interpolation uses only the P0 interpolator.
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
        return DAG.getUNDEF(MVT::f32);
      SDValue FullVector = DAG.getNode(
          AMDGPUISD::INTERP_P0,
          DAG.getConstant(slot / 4 , MVT::i32));
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
          DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));

    // Implicit kernel parameters are loaded from the parameter buffer at
    // fixed dword offsets (0-8 below).
    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    // Workgroup / thread ids are preloaded into fixed T registers.
    // NOTE(review): the specific register operand of each call below is
    // missing (truncated argument lists).
    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
  } // end switch(Op.getOpcode())
524 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
525 SmallVectorImpl<SDValue> &Results,
526 SelectionDAG &DAG) const {
527 switch (N->getOpcode()) {
529 case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
533 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
538 Op, DAG.getConstantFP(0.0f, MVT::f32),
539 DAG.getCondCode(ISD::SETNE)
543 SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
544 SDValue Chain = Op.getOperand(0);
545 SDValue CC = Op.getOperand(1);
546 SDValue LHS = Op.getOperand(2);
547 SDValue RHS = Op.getOperand(3);
548 SDValue JumpT = Op.getOperand(4);
552 if (LHS.getValueType() == MVT::i32) {
553 CmpValue = DAG.getNode(
558 DAG.getConstant(-1, MVT::i32),
559 DAG.getConstant(0, MVT::i32),
561 } else if (LHS.getValueType() == MVT::f32) {
562 CmpValue = DAG.getNode(
567 DAG.getConstantFP(1.0f, MVT::f32),
568 DAG.getConstantFP(0.0f, MVT::f32),
571 assert(0 && "Not valid type for br_cc");
573 Result = DAG.getNode(
574 AMDGPUISD::BRANCH_COND,
575 CmpValue.getDebugLoc(),
581 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
583 unsigned DwordOffset) const {
584 unsigned ByteOffset = DwordOffset * 4;
585 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
586 AMDGPUAS::PARAM_I_ADDRESS);
588 // We shouldn't be using an offset wider than 16-bits for implicit parameters.
589 assert(isInt<16>(ByteOffset));
591 return DAG.getLoad(VT, DL, DAG.getEntryNode(),
592 DAG.getConstant(ByteOffset, MVT::i32), // PTR
593 MachinePointerInfo(ConstantPointerNull::get(PtrType)),
594 false, false, false, 0);
597 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
598 DebugLoc DL = Op.getDebugLoc();
599 EVT VT = Op.getValueType();
601 return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
604 DAG.getNode(ISD::SUB, DL, VT,
605 DAG.getConstant(32, MVT::i32),
609 bool R600TargetLowering::isZero(SDValue Op) const {
610 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
611 return Cst->isNullValue();
612 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
613 return CstFP->isZero();
// Lower SELECT_CC for i32/f32.  Strategy, in order of preference:
//  1. CND* form when one compare operand is zero (cheapest),
//  2. SET* form when True/False are already the hardware boolean values,
//  3. min/max pattern matching,
//  4. otherwise split into two chained SELECT_CC nodes.
//
// NOTE(review): several DAG.getNode argument lists and closing braces
// below appear truncated — confirm against the upstream file.
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a CND* instruction:
  // CND* instructions requires RHS to be zero. Some SELECT_CC nodes that
  // can be lowered to CND* instructions can also be lowered to SET*
  // instructions. CND* instructions are cheaper, because they dont't
  // require additional instructions to convert their result to the correct
  // value type, so this check should be first.
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    // When the zero was on the LHS, the condition must be swapped to keep
    // the compare semantics with the zero operand on the right.
    CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);

  // Try to lower to a SET* instruction:
  // We need all the operands of SELECT_CC to have the same value type, so if
  // necessary we need to change True and False to be the same type as LHS and
  // RHS, and then convert the result of the select_cc back to the correct type.

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));

  if (isHWTrueValue(True) && isHWFalseValue(False)) {
    if (CompareVT != VT) {
      if (VT == MVT::f32 && CompareVT == MVT::i32) {
        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
            DAG.getConstant(-1, MVT::i32),
            DAG.getConstant(0, MVT::i32),
        // Convert integer values of true (-1) and false (0) to fp values of
        // true (1.0f) and false (0.0f).
        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
            DAG.getConstant(1, MVT::i32));
        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
            DAG.getConstantFP(1.0f, MVT::f32),
            DAG.getConstantFP(0.0f, MVT::f32),
        // Convert fp values of true (1.0f) and false (0.0f) to integer values
        // of true (-1) and false (0).
        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
      // I don't think there will be any other type pairings.
      assert(!"Unhandled operand type parings in SELECT_CC");
    // This SELECT_CC is already legal.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {

  // If we make it this for it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
    assert(!"Unhandled value type in LowerSELECT_CC");

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      DAG.getCondCode(ISD::SETNE));
758 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
759 return DAG.getNode(ISD::SELECT_CC,
763 DAG.getConstant(0, MVT::i32),
766 DAG.getCondCode(ISD::SETNE));
// Lower SETCC (result type i32) by materializing the compare with a
// SELECT_CC using the hardware boolean encodings (-1/0 for i32 compares,
// 1.0f/0.0f for f32 compares), then normalizing the result to 0/1.
//
// NOTE(review): both SELECT_CC builders and the f32->i32 conversion of the
// result appear truncated below — confirm against the upstream file.
SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  assert(Op.getValueType() == MVT::i32);
  if (LHS.getValueType() == MVT::i32) {
      // Integer compare: true is all-ones.
      DAG.getConstant(-1, MVT::i32),
      DAG.getConstant(0, MVT::i32),
  } else if (LHS.getValueType() == MVT::f32) {
      // Float compare: true is 1.0f.
      DAG.getConstantFP(1.0f, MVT::f32),
      DAG.getConstantFP(0.0f, MVT::f32),
  assert(0 && "Not valid type for set_cc");
      // Mask the hardware boolean down to the 0/1 expected of SETCC.
      DAG.getConstant(1, MVT::i32),
811 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
812 DebugLoc DL = Op.getDebugLoc();
813 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
814 SDValue Chain = Op.getOperand(0);
815 SDValue Value = Op.getOperand(1);
816 SDValue Ptr = Op.getOperand(2);
818 if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
819 Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
820 // Convert pointer from byte address to dword address.
821 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
822 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
823 Ptr, DAG.getConstant(2, MVT::i32)));
825 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
826 assert(!"Truncated and indexed stores not supported yet");
828 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
836 SDValue R600TargetLowering::LowerFPOW(SDValue Op,
837 SelectionDAG &DAG) const {
838 DebugLoc DL = Op.getDebugLoc();
839 EVT VT = Op.getValueType();
840 SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
841 SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
842 return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
845 /// XXX Only kernel functions are supported, so we can assume for now that
846 /// every function is a kernel function, but in the future we should use
847 /// separate calling conventions for kernel and non-kernel functions.
// Lower incoming kernel arguments: each argument is read from the
// parameter buffer (PARAM_I address space) as a zero-extended load at a
// running byte offset.  The offset starts at 36 — presumably past the
// implicit-parameter block lowered by LowerImplicitParameter (TODO
// confirm against the ABI layout).
//
// NOTE(review): this function appears truncated — the `Chain`/`isVarArg`
// signature lines, the declarations of `VT`/`ArgVT` inside the loop, and
// the final `return Chain;` plus closing brace are missing; confirm
// against the upstream file.
SDValue R600TargetLowering::LowerFormalArguments(
                                      CallingConv::ID CallConv,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
                            DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    Type *ArgType = FuncArg->getType();
    // Pointers are 32-bit on this target.
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      // Narrow integer arguments are loaded via a zero-extending load of
      // their actual width.
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(new Argument(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    // Next argument starts right after this one in the parameter buffer.
    ParamOffsetBytes += ArgBytes;
884 EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
885 if (!VT.isVector()) return MVT::i32;
886 return VT.changeVectorElementTypeToInteger();
889 //===----------------------------------------------------------------------===//
890 // Custom DAG Optimizations
891 //===----------------------------------------------------------------------===//
893 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
894 DAGCombinerInfo &DCI) const {
895 SelectionDAG &DAG = DCI.DAG;
897 switch (N->getOpcode()) {
898 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
899 case ISD::FP_ROUND: {
900 SDValue Arg = N->getOperand(0);
901 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
902 return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),