lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "R600Defines.h"
  17 #include "R600InstrInfo.h"
  18 #include "R600MachineFunctionInfo.h"
  19 #include "llvm/CodeGen/MachineFrameInfo.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineRegisterInfo.h"
  22 #include "llvm/CodeGen/SelectionDAG.h"
  23 #include "llvm/IR/Argument.h"
  24 #include "llvm/IR/Function.h"
  25
  26 using namespace llvm;
  27
  28 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  29     AMDGPUTargetLowering(TM),
  30     TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  31   setOperationAction(ISD::MUL, MVT::i64, Expand);
  32   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  33   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  34   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  35   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  36   computeRegisterProperties();
  37
  38   setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  39   setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  40   setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  41   setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
  42
  43   setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  44   setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  45   setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  46   setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  47   setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  48   setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  49   setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  50   setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  51   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  52
  53   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  54   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  55
  56   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  57
  58   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  59   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  60   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  61   setOperationAction(ISD::FPOW, MVT::f32, Custom);
  62
  63   setOperationAction(ISD::ROTL, MVT::i32, Custom);
  64
  65   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  66   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  67
  68   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  69   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  70   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  71
  72   setOperationAction(ISD::SELECT, MVT::i32, Custom);
  73   setOperationAction(ISD::SELECT, MVT::f32, Custom);
  74
  75   // Legalize loads and stores to the private address space.
  76   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  77   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  78   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  79   setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  80   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  81   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  82   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  83   setOperationAction(ISD::STORE, MVT::i8, Custom);
  84   setOperationAction(ISD::STORE, MVT::i32, Custom);
  85   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  86   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  87
  88   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  89   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  90   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
  91
  92   setTargetDAGCombine(ISD::FP_ROUND);
  93   setTargetDAGCombine(ISD::FP_TO_SINT);
  94   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  95   setTargetDAGCombine(ISD::SELECT_CC);
  96
  97   setBooleanContents(ZeroOrNegativeOneBooleanContent);
  98   setSchedulingPreference(Sched::VLIW);
  99 }
 100
 101 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 102     MachineInstr * MI, MachineBasicBlock * BB) const {
 103   MachineFunction * MF = BB->getParent();
 104   MachineRegisterInfo &MRI = MF->getRegInfo();
 105   MachineBasicBlock::iterator I = *MI;
 106
 107   switch (MI->getOpcode()) {
 108   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 109   case AMDGPU::CLAMP_R600: {
 110     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 111                                                    AMDGPU::MOV,
 112                                                    MI->getOperand(0).getReg(),
 113                                                    MI->getOperand(1).getReg());
 114     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 115     break;
 116   }
 117
 118   case AMDGPU::FABS_R600: {
 119     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 120                                                     AMDGPU::MOV,
 121                                                     MI->getOperand(0).getReg(),
 122                                                     MI->getOperand(1).getReg());
 123     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 124     break;
 125   }
 126
 127   case AMDGPU::FNEG_R600: {
 128     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 129                                                     AMDGPU::MOV,
 130                                                     MI->getOperand(0).getReg(),
 131                                                     MI->getOperand(1).getReg());
 132     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 133     break;
 134   }
 135
 136   case AMDGPU::MASK_WRITE: {
 137     unsigned maskedRegister = MI->getOperand(0).getReg();
 138     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 139     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 140     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 141     break;
 142   }
 143
 144   case AMDGPU::MOV_IMM_F32:
 145     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 146                      MI->getOperand(1).getFPImm()->getValueAPF()
 147                          .bitcastToAPInt().getZExtValue());
 148     break;
 149   case AMDGPU::MOV_IMM_I32:
 150     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 151                      MI->getOperand(1).getImm());
 152     break;
 153   case AMDGPU::CONST_COPY: {
 154     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 155         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 156     TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
 157         MI->getOperand(1).getImm());
 158     break;
 159   }
 160
 161   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 162   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 163     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 164
 165     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 166             .addOperand(MI->getOperand(0))
 167             .addOperand(MI->getOperand(1))
 168             .addImm(EOP); // Set End of program bit
 169     break;
 170   }
 171
 172   case AMDGPU::TXD: {
 173     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 174     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 175
 176     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 177             .addOperand(MI->getOperand(3))
 178             .addOperand(MI->getOperand(4))
 179             .addOperand(MI->getOperand(5))
 180             .addOperand(MI->getOperand(6));
 181     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 182             .addOperand(MI->getOperand(2))
 183             .addOperand(MI->getOperand(4))
 184             .addOperand(MI->getOperand(5))
 185             .addOperand(MI->getOperand(6));
 186     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 187             .addOperand(MI->getOperand(0))
 188             .addOperand(MI->getOperand(1))
 189             .addOperand(MI->getOperand(4))
 190             .addOperand(MI->getOperand(5))
 191             .addOperand(MI->getOperand(6))
 192             .addReg(T0, RegState::Implicit)
 193             .addReg(T1, RegState::Implicit);
 194     break;
 195   }
 196
 197   case AMDGPU::TXD_SHADOW: {
 198     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 199     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 200
 201     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 202             .addOperand(MI->getOperand(3))
 203             .addOperand(MI->getOperand(4))
 204             .addOperand(MI->getOperand(5))
 205             .addOperand(MI->getOperand(6));
 206     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 207             .addOperand(MI->getOperand(2))
 208             .addOperand(MI->getOperand(4))
 209             .addOperand(MI->getOperand(5))
 210             .addOperand(MI->getOperand(6));
 211     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 212             .addOperand(MI->getOperand(0))
 213             .addOperand(MI->getOperand(1))
 214             .addOperand(MI->getOperand(4))
 215             .addOperand(MI->getOperand(5))
 216             .addOperand(MI->getOperand(6))
 217             .addReg(T0, RegState::Implicit)
 218             .addReg(T1, RegState::Implicit);
 219     break;
 220   }
 221
 222   case AMDGPU::BRANCH:
 223       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 224               .addOperand(MI->getOperand(0));
 225       break;
 226
 227   case AMDGPU::BRANCH_COND_f32: {
 228     MachineInstr *NewMI =
 229       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 230               AMDGPU::PREDICATE_BIT)
 231               .addOperand(MI->getOperand(1))
 232               .addImm(OPCODE_IS_NOT_ZERO)
 233               .addImm(0); // Flags
 234     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 235     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 236             .addOperand(MI->getOperand(0))
 237             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 238     break;
 239   }
 240
 241   case AMDGPU::BRANCH_COND_i32: {
 242     MachineInstr *NewMI =
 243       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 244             AMDGPU::PREDICATE_BIT)
 245             .addOperand(MI->getOperand(1))
 246             .addImm(OPCODE_IS_NOT_ZERO_INT)
 247             .addImm(0); // Flags
 248     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 249     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 250            .addOperand(MI->getOperand(0))
 251             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 252     break;
 253   }
 254
 255   case AMDGPU::EG_ExportSwz:
 256   case AMDGPU::R600_ExportSwz: {
 257     // Instruction is left unmodified if its not the last one of its type
 258     bool isLastInstructionOfItsType = true;
 259     unsigned InstExportType = MI->getOperand(1).getImm();
 260     for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
 261          EndBlock = BB->end(); NextExportInst != EndBlock;
 262          NextExportInst = llvm::next(NextExportInst)) {
 263       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 264           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 265         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 266             .getImm();
 267         if (CurrentInstExportType == InstExportType) {
 268           isLastInstructionOfItsType = false;
 269           break;
 270         }
 271       }
 272     }
 273     bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
 274     if (!EOP && !isLastInstructionOfItsType)
 275       return BB;
 276     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 277     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 278             .addOperand(MI->getOperand(0))
 279             .addOperand(MI->getOperand(1))
 280             .addOperand(MI->getOperand(2))
 281             .addOperand(MI->getOperand(3))
 282             .addOperand(MI->getOperand(4))
 283             .addOperand(MI->getOperand(5))
 284             .addOperand(MI->getOperand(6))
 285             .addImm(CfInst)
 286             .addImm(EOP);
 287     break;
 288   }
 289   case AMDGPU::RETURN: {
 290     // RETURN instructions must have the live-out registers as implicit uses,
 291     // otherwise they appear dead.
 292     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 293     MachineInstrBuilder MIB(*MF, MI);
 294     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 295       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 296     return BB;
 297   }
 298   }
 299
 300   MI->eraseFromParent();
 301   return BB;
 302 }
 303
 304 //===----------------------------------------------------------------------===//
 305 // Custom DAG Lowering Operations
 306 //===----------------------------------------------------------------------===//
 307
 308 using namespace llvm::Intrinsic;
 309 using namespace llvm::AMDGPUIntrinsic;
 310
 311 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 312   switch (Op.getOpcode()) {
 313   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 314   case ISD::ROTL: return LowerROTL(Op, DAG);
 315   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 316   case ISD::SELECT: return LowerSELECT(Op, DAG);
 317   case ISD::STORE: return LowerSTORE(Op, DAG);
 318   case ISD::LOAD: return LowerLOAD(Op, DAG);
 319   case ISD::FPOW: return LowerFPOW(Op, DAG);
 320   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
 321   case ISD::INTRINSIC_VOID: {
 322     SDValue Chain = Op.getOperand(0);
 323     unsigned IntrinsicID =
 324                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 325     switch (IntrinsicID) {
 326     case AMDGPUIntrinsic::AMDGPU_store_output: {
 327       MachineFunction &MF = DAG.getMachineFunction();
 328       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 329       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 330       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 331       MFI->LiveOuts.push_back(Reg);
 332       return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
 333     }
 334     case AMDGPUIntrinsic::R600_store_swizzle: {
 335       const SDValue Args[8] = {
 336         Chain,
 337         Op.getOperand(2), // Export Value
 338         Op.getOperand(3), // ArrayBase
 339         Op.getOperand(4), // Type
 340         DAG.getConstant(0, MVT::i32), // SWZ_X
 341         DAG.getConstant(1, MVT::i32), // SWZ_Y
 342         DAG.getConstant(2, MVT::i32), // SWZ_Z
 343         DAG.getConstant(3, MVT::i32) // SWZ_W
 344       };
 345       return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
 346           Args, 8);
 347     }
 348
 349     // default for switch(IntrinsicID)
 350     default: break;
 351     }
 352     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 353     break;
 354   }
 355   case ISD::INTRINSIC_WO_CHAIN: {
 356     unsigned IntrinsicID =
 357                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 358     EVT VT = Op.getValueType();
 359     DebugLoc DL = Op.getDebugLoc();
 360     switch(IntrinsicID) {
 361     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 362     case AMDGPUIntrinsic::R600_load_input: {
 363       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 364       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 365       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
 366     }
 367
 368     case AMDGPUIntrinsic::R600_interp_input: {
 369       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 370       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 371       MachineSDNode *interp;
 372       if (ijb < 0) {
 373         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 374             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 375         return DAG.getTargetExtractSubreg(
 376             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 377             DL, MVT::f32, SDValue(interp, 0));
 378       }
 379
 380       if (slot % 4 < 2)
 381         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 382             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 383             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 384                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 385             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 386                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 387       else
 388         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 389             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 390             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 391                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
 392             CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 393                 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
 394
 395       return SDValue(interp, slot % 2);
 396     }
 397
 398     case r600_read_ngroups_x:
 399       return LowerImplicitParameter(DAG, VT, DL, 0);
 400     case r600_read_ngroups_y:
 401       return LowerImplicitParameter(DAG, VT, DL, 1);
 402     case r600_read_ngroups_z:
 403       return LowerImplicitParameter(DAG, VT, DL, 2);
 404     case r600_read_global_size_x:
 405       return LowerImplicitParameter(DAG, VT, DL, 3);
 406     case r600_read_global_size_y:
 407       return LowerImplicitParameter(DAG, VT, DL, 4);
 408     case r600_read_global_size_z:
 409       return LowerImplicitParameter(DAG, VT, DL, 5);
 410     case r600_read_local_size_x:
 411       return LowerImplicitParameter(DAG, VT, DL, 6);
 412     case r600_read_local_size_y:
 413       return LowerImplicitParameter(DAG, VT, DL, 7);
 414     case r600_read_local_size_z:
 415       return LowerImplicitParameter(DAG, VT, DL, 8);
 416
 417     case r600_read_tgid_x:
 418       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 419                                   AMDGPU::T1_X, VT);
 420     case r600_read_tgid_y:
 421       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 422                                   AMDGPU::T1_Y, VT);
 423     case r600_read_tgid_z:
 424       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 425                                   AMDGPU::T1_Z, VT);
 426     case r600_read_tidig_x:
 427       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 428                                   AMDGPU::T0_X, VT);
 429     case r600_read_tidig_y:
 430       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 431                                   AMDGPU::T0_Y, VT);
 432     case r600_read_tidig_z:
 433       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 434                                   AMDGPU::T0_Z, VT);
 435     }
 436     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 437     break;
 438   }
 439   } // end switch(Op.getOpcode())
 440   return SDValue();
 441 }
 442
 443 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 444                                             SmallVectorImpl<SDValue> &Results,
 445                                             SelectionDAG &DAG) const {
 446   switch (N->getOpcode()) {
 447   default: return;
 448   case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 449     return;
 450   case ISD::LOAD: {
 451     SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
 452     Results.push_back(SDValue(Node, 0));
 453     Results.push_back(SDValue(Node, 1));
 454     // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
 455     // function
 456     DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
 457     return;
 458   }
 459   case ISD::STORE:
 460     SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
 461     Results.push_back(SDValue(Node, 0));
 462     return;
 463   }
 464 }
 465
 466 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 467   return DAG.getNode(
 468       ISD::SETCC,
 469       Op.getDebugLoc(),
 470       MVT::i1,
 471       Op, DAG.getConstantFP(0.0f, MVT::f32),
 472       DAG.getCondCode(ISD::SETNE)
 473       );
 474 }
 475
 476 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
 477                                                    DebugLoc DL,
 478                                                    unsigned DwordOffset) const {
 479   unsigned ByteOffset = DwordOffset * 4;
 480   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 481                                       AMDGPUAS::PARAM_I_ADDRESS);
 482
 483   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
 484   assert(isInt<16>(ByteOffset));
 485
 486   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
 487                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
 488                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
 489                      false, false, false, 0);
 490 }
 491
 492 SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
 493
 494   MachineFunction &MF = DAG.getMachineFunction();
 495   const AMDGPUFrameLowering *TFL =
 496    static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
 497
 498   FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
 499   assert(FIN);
 500
 501   unsigned FrameIndex = FIN->getIndex();
 502   unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
 503   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
 504 }
 505
 506 SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
 507   DebugLoc DL = Op.getDebugLoc();
 508   EVT VT = Op.getValueType();
 509
 510   return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
 511                      Op.getOperand(0),
 512                      Op.getOperand(0),
 513                      DAG.getNode(ISD::SUB, DL, VT,
 514                                  DAG.getConstant(32, MVT::i32),
 515                                  Op.getOperand(1)));
 516 }
 517
 518 bool R600TargetLowering::isZero(SDValue Op) const {
 519   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
 520     return Cst->isNullValue();
 521   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
 522     return CstFP->isZero();
 523   } else {
 524     return false;
 525   }
 526 }
 527
 528 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 529   DebugLoc DL = Op.getDebugLoc();
 530   EVT VT = Op.getValueType();
 531
 532   SDValue LHS = Op.getOperand(0);
 533   SDValue RHS = Op.getOperand(1);
 534   SDValue True = Op.getOperand(2);
 535   SDValue False = Op.getOperand(3);
 536   SDValue CC = Op.getOperand(4);
 537   SDValue Temp;
 538
 539   // LHS and RHS are guaranteed to be the same value type
 540   EVT CompareVT = LHS.getValueType();
 541
 542   // Check if we can lower this to a native operation.
 543
 544   // Try to lower to a SET* instruction:
 545   //
 546   // SET* can match the following patterns:
 547   //
 548   // select_cc f32, f32, -1,  0, cc_any
 549   // select_cc f32, f32, 1.0f, 0.0f, cc_any
 550   // select_cc i32, i32, -1,  0, cc_any
 551   //
 552
 553   // Move hardware True/False values to the correct operand.
 554   if (isHWTrueValue(False) && isHWFalseValue(True)) {
 555     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 556     std::swap(False, True);
 557     CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
 558   }
 559
 560   if (isHWTrueValue(True) && isHWFalseValue(False) &&
 561       (CompareVT == VT || VT == MVT::i32)) {
 562     // This can be matched by a SET* instruction.
 563     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
 564   }
 565
 566   // Try to lower to a CND* instruction:
 567   //
 568   // CND* can match the following patterns:
 569   //
 570   // select_cc f32, 0.0, f32, f32, cc_any
 571   // select_cc f32, 0.0, i32, i32, cc_any
 572   // select_cc i32, 0,   f32, f32, cc_any
 573   // select_cc i32, 0,   i32, i32, cc_any
 574   //
 575   if (isZero(LHS) || isZero(RHS)) {
 576     SDValue Cond = (isZero(LHS) ? RHS : LHS);
 577     SDValue Zero = (isZero(LHS) ? LHS : RHS);
 578     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
 579     if (CompareVT != VT) {
 580       // Bitcast True / False to the correct types.  This will end up being
 581       // a nop, but it allows us to define only a single pattern in the
 582       // .TD files for each CND* instruction rather than having to have
 583       // one pattern for integer True/False and one for fp True/False
 584       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
 585       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
 586     }
 587     if (isZero(LHS)) {
 588       CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
 589     }
 590
 591     switch (CCOpcode) {
 592     case ISD::SETONE:
 593     case ISD::SETUNE:
 594     case ISD::SETNE:
 595     case ISD::SETULE:
 596     case ISD::SETULT:
 597     case ISD::SETOLE:
 598     case ISD::SETOLT:
 599     case ISD::SETLE:
 600     case ISD::SETLT:
 601       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
 602       Temp = True;
 603       True = False;
 604       False = Temp;
 605       break;
 606     default:
 607       break;
 608     }
 609     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
 610         Cond, Zero,
 611         True, False,
 612         DAG.getCondCode(CCOpcode));
 613     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
 614   }
 615
 616
 617   // Possible Min/Max pattern
 618   SDValue MinMax = LowerMinMax(Op, DAG);
 619   if (MinMax.getNode()) {
 620     return MinMax;
 621   }
 622
 623   // If we make it this for it means we have no native instructions to handle
 624   // this SELECT_CC, so we must lower it.
 625   SDValue HWTrue, HWFalse;
 626
 627   if (CompareVT == MVT::f32) {
 628     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
 629     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
 630   } else if (CompareVT == MVT::i32) {
 631     HWTrue = DAG.getConstant(-1, CompareVT);
 632     HWFalse = DAG.getConstant(0, CompareVT);
 633   }
 634   else {
 635     assert(!"Unhandled value type in LowerSELECT_CC");
 636   }
 637
 638   // Lower this unsupported SELECT_CC into a combination of two supported
 639   // SELECT_CC operations.
 640   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
 641
 642   return DAG.getNode(ISD::SELECT_CC, DL, VT,
 643       Cond, HWFalse,
 644       True, False,
 645       DAG.getCondCode(ISD::SETNE));
 646 }
 647
 648 SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 649   return DAG.getNode(ISD::SELECT_CC,
 650       Op.getDebugLoc(),
 651       Op.getValueType(),
 652       Op.getOperand(0),
 653       DAG.getConstant(0, MVT::i32),
 654       Op.getOperand(1),
 655       Op.getOperand(2),
 656       DAG.getCondCode(ISD::SETNE));
 657 }
 658
 659 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 660 /// convert these pointers to a register index.  Each register holds
 661 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
 662 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
 663 /// for indirect addressing.
 664 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
 665                                                unsigned StackWidth,
 666                                                SelectionDAG &DAG) const {
 667   unsigned SRLPad;
 668   switch(StackWidth) {
 669   case 1:
 670     SRLPad = 2;
 671     break;
 672   case 2:
 673     SRLPad = 3;
 674     break;
 675   case 4:
 676     SRLPad = 4;
 677     break;
 678   default: llvm_unreachable("Invalid stack width");
 679   }
 680
 681   return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
 682                      DAG.getConstant(SRLPad, MVT::i32));
 683 }
 684
 685 void R600TargetLowering::getStackAddress(unsigned StackWidth,
 686                                          unsigned ElemIdx,
 687                                          unsigned &Channel,
 688                                          unsigned &PtrIncr) const {
 689   switch (StackWidth) {
 690   default:
 691   case 1:
 692     Channel = 0;
 693     if (ElemIdx > 0) {
 694       PtrIncr = 1;
 695     } else {
 696       PtrIncr = 0;
 697     }
 698     break;
 699   case 2:
 700     Channel = ElemIdx % 2;
 701     if (ElemIdx == 2) {
 702       PtrIncr = 1;
 703     } else {
 704       PtrIncr = 0;
 705     }
 706     break;
 707   case 4:
 708     Channel = ElemIdx;
 709     PtrIncr = 0;
 710     break;
 711   }
 712 }
 713
 714 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 715   DebugLoc DL = Op.getDebugLoc();
 716   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
 717   SDValue Chain = Op.getOperand(0);
 718   SDValue Value = Op.getOperand(1);
 719   SDValue Ptr = Op.getOperand(2);
 720
 721   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 722       Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
 723     // Convert pointer from byte address to dword address.
 724     Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
 725                       DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
 726                                   Ptr, DAG.getConstant(2, MVT::i32)));
 727
 728     if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
 729       assert(!"Truncated and indexed stores not supported yet");
 730     } else {
 731       Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
 732     }
 733     return Chain;
 734   }
 735
 736   EVT ValueVT = Value.getValueType();
 737
 738   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
 739     return SDValue();
 740   }
 741
 742   // Lowering for indirect addressing
 743
 744   const MachineFunction &MF = DAG.getMachineFunction();
 745   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
 746                                          getTargetMachine().getFrameLowering());
 747   unsigned StackWidth = TFL->getStackWidth(MF);
 748
 749   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
 750
 751   if (ValueVT.isVector()) {
 752     unsigned NumElemVT = ValueVT.getVectorNumElements();
 753     EVT ElemVT = ValueVT.getVectorElementType();
 754     SDValue Stores[4];
 755
 756     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
 757                                       "vector width in load");
 758
 759     for (unsigned i = 0; i < NumElemVT; ++i) {
 760       unsigned Channel, PtrIncr;
 761       getStackAddress(StackWidth, i, Channel, PtrIncr);
 762       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
 763                         DAG.getConstant(PtrIncr, MVT::i32));
 764       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
 765                                  Value, DAG.getConstant(i, MVT::i32));
 766
 767       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
 768                               Chain, Elem, Ptr,
 769                               DAG.getTargetConstant(Channel, MVT::i32));
 770     }
 771      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
 772    } else {
 773     if (ValueVT == MVT::i8) {
 774       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
 775     }
 776     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
 777     DAG.getTargetConstant(0, MVT::i32)); // Channel
 778   }
 779
 780   return Chain;
 781 }
 782
 783 // return (512 + (kc_bank << 12)
 784 static int
 785 ConstantAddressBlock(unsigned AddressSpace) {
 786   switch (AddressSpace) {
 787   case AMDGPUAS::CONSTANT_BUFFER_0:
 788     return 512;
 789   case AMDGPUAS::CONSTANT_BUFFER_1:
 790     return 512 + 4096;
 791   case AMDGPUAS::CONSTANT_BUFFER_2:
 792     return 512 + 4096 * 2;
 793   case AMDGPUAS::CONSTANT_BUFFER_3:
 794     return 512 + 4096 * 3;
 795   case AMDGPUAS::CONSTANT_BUFFER_4:
 796     return 512 + 4096 * 4;
 797   case AMDGPUAS::CONSTANT_BUFFER_5:
 798     return 512 + 4096 * 5;
 799   case AMDGPUAS::CONSTANT_BUFFER_6:
 800     return 512 + 4096 * 6;
 801   case AMDGPUAS::CONSTANT_BUFFER_7:
 802     return 512 + 4096 * 7;
 803   case AMDGPUAS::CONSTANT_BUFFER_8:
 804     return 512 + 4096 * 8;
 805   case AMDGPUAS::CONSTANT_BUFFER_9:
 806     return 512 + 4096 * 9;
 807   case AMDGPUAS::CONSTANT_BUFFER_10:
 808     return 512 + 4096 * 10;
 809   case AMDGPUAS::CONSTANT_BUFFER_11:
 810     return 512 + 4096 * 11;
 811   case AMDGPUAS::CONSTANT_BUFFER_12:
 812     return 512 + 4096 * 12;
 813   case AMDGPUAS::CONSTANT_BUFFER_13:
 814     return 512 + 4096 * 13;
 815   case AMDGPUAS::CONSTANT_BUFFER_14:
 816     return 512 + 4096 * 14;
 817   case AMDGPUAS::CONSTANT_BUFFER_15:
 818     return 512 + 4096 * 15;
 819   default:
 820     return -1;
 821   }
 822 }
 823
 824 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
 825 {
 826   EVT VT = Op.getValueType();
 827   DebugLoc DL = Op.getDebugLoc();
 828   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
 829   SDValue Chain = Op.getOperand(0);
 830   SDValue Ptr = Op.getOperand(1);
 831   SDValue LoweredLoad;
 832
 833   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
 834   if (ConstantBlock > -1) {
 835     SDValue Result;
 836     if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
 837         dyn_cast<Constant>(LoadNode->getSrcValue()) ||
 838         dyn_cast<ConstantSDNode>(Ptr)) {
 839       SDValue Slots[4];
 840       for (unsigned i = 0; i < 4; i++) {
 841         // We want Const position encoded with the following formula :
 842         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
 843         // const_index is Ptr computed by llvm using an alignment of 16.
 844         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
 845         // then div by 4 at the ISel step
 846         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
 847             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
 848         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
 849       }
 850       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
 851     } else {
 852       // non constant ptr cant be folded, keeps it as a v4f32 load
 853       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
 854           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
 855           DAG.getConstant(LoadNode->getAddressSpace() -
 856                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
 857           );
 858     }
 859
 860     if (!VT.isVector()) {
 861       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
 862           DAG.getConstant(0, MVT::i32));
 863     }
 864
 865     SDValue MergedValues[2] = {
 866         Result,
 867         Chain
 868     };
 869     return DAG.getMergeValues(MergedValues, 2, DL);
 870   }
 871
 872   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
 873     return SDValue();
 874   }
 875
 876   // Lowering for indirect addressing
 877   const MachineFunction &MF = DAG.getMachineFunction();
 878   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
 879                                          getTargetMachine().getFrameLowering());
 880   unsigned StackWidth = TFL->getStackWidth(MF);
 881
 882   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
 883
 884   if (VT.isVector()) {
 885     unsigned NumElemVT = VT.getVectorNumElements();
 886     EVT ElemVT = VT.getVectorElementType();
 887     SDValue Loads[4];
 888
 889     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
 890                                       "vector width in load");
 891
 892     for (unsigned i = 0; i < NumElemVT; ++i) {
 893       unsigned Channel, PtrIncr;
 894       getStackAddress(StackWidth, i, Channel, PtrIncr);
 895       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
 896                         DAG.getConstant(PtrIncr, MVT::i32));
 897       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
 898                              Chain, Ptr,
 899                              DAG.getTargetConstant(Channel, MVT::i32),
 900                              Op.getOperand(2));
 901     }
 902     for (unsigned i = NumElemVT; i < 4; ++i) {
 903       Loads[i] = DAG.getUNDEF(ElemVT);
 904     }
 905     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
 906     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
 907   } else {
 908     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
 909                               Chain, Ptr,
 910                               DAG.getTargetConstant(0, MVT::i32), // Channel
 911                               Op.getOperand(2));
 912   }
 913
 914   SDValue Ops[2];
 915   Ops[0] = LoweredLoad;
 916   Ops[1] = Chain;
 917
 918   return DAG.getMergeValues(Ops, 2, DL);
 919 }
 920
 921 SDValue R600TargetLowering::LowerFPOW(SDValue Op,
 922     SelectionDAG &DAG) const {
 923   DebugLoc DL = Op.getDebugLoc();
 924   EVT VT = Op.getValueType();
 925   SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0));
 926   SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase);
 927   return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase);
 928 }
 929
 930 /// XXX Only kernel functions are supported, so we can assume for now that
 931 /// every function is a kernel function, but in the future we should use
 932 /// separate calling conventions for kernel and non-kernel functions.
 933 SDValue R600TargetLowering::LowerFormalArguments(
 934                                       SDValue Chain,
 935                                       CallingConv::ID CallConv,
 936                                       bool isVarArg,
 937                                       const SmallVectorImpl<ISD::InputArg> &Ins,
 938                                       DebugLoc DL, SelectionDAG &DAG,
 939                                       SmallVectorImpl<SDValue> &InVals) const {
 940   unsigned ParamOffsetBytes = 36;
 941   Function::const_arg_iterator FuncArg =
 942                             DAG.getMachineFunction().getFunction()->arg_begin();
 943   for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
 944     EVT VT = Ins[i].VT;
 945     Type *ArgType = FuncArg->getType();
 946     unsigned ArgSizeInBits = ArgType->isPointerTy() ?
 947                              32 : ArgType->getPrimitiveSizeInBits();
 948     unsigned ArgBytes = ArgSizeInBits >> 3;
 949     EVT ArgVT;
 950     if (ArgSizeInBits < VT.getSizeInBits()) {
 951       assert(!ArgType->isFloatTy() &&
 952              "Extending floating point arguments not supported yet");
 953       ArgVT = MVT::getIntegerVT(ArgSizeInBits);
 954     } else {
 955       ArgVT = VT;
 956     }
 957     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
 958                                                     AMDGPUAS::PARAM_I_ADDRESS);
 959     SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
 960                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
 961                                        MachinePointerInfo(UndefValue::get(PtrTy)),
 962                                        ArgVT, false, false, ArgBytes);
 963     InVals.push_back(Arg);
 964     ParamOffsetBytes += ArgBytes;
 965   }
 966   return Chain;
 967 }
 968
 969 EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
 970    if (!VT.isVector()) return MVT::i32;
 971    return VT.changeVectorElementTypeToInteger();
 972 }
 973
 974 //===----------------------------------------------------------------------===//
 975 // Custom DAG Optimizations
 976 //===----------------------------------------------------------------------===//
 977
 978 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
 979                                               DAGCombinerInfo &DCI) const {
 980   SelectionDAG &DAG = DCI.DAG;
 981
 982   switch (N->getOpcode()) {
 983   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
 984   case ISD::FP_ROUND: {
 985       SDValue Arg = N->getOperand(0);
 986       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
 987         return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
 988                            Arg.getOperand(0));
 989       }
 990       break;
 991     }
 992
 993   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
 994   // (i32 select_cc f32, f32, -1, 0 cc)
 995   //
 996   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
 997   // this to one of the SET*_DX10 instructions.
 998   case ISD::FP_TO_SINT: {
 999     SDValue FNeg = N->getOperand(0);
1000     if (FNeg.getOpcode() != ISD::FNEG) {
1001       return SDValue();
1002     }
1003     SDValue SelectCC = FNeg.getOperand(0);
1004     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1005         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1006         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1007         !isHWTrueValue(SelectCC.getOperand(2)) ||
1008         !isHWFalseValue(SelectCC.getOperand(3))) {
1009       return SDValue();
1010     }
1011
1012     return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
1013                            SelectCC.getOperand(0), // LHS
1014                            SelectCC.getOperand(1), // RHS
1015                            DAG.getConstant(-1, MVT::i32), // True
1016                            DAG.getConstant(0, MVT::i32),  // Flase
1017                            SelectCC.getOperand(4)); // CC
1018
1019     break;
1020   }
1021   // Extract_vec (Build_vector) generated by custom lowering
1022   // also needs to be customly combined
1023   case ISD::EXTRACT_VECTOR_ELT: {
1024     SDValue Arg = N->getOperand(0);
1025     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1026       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1027         unsigned Element = Const->getZExtValue();
1028         return Arg->getOperand(Element);
1029       }
1030     }
1031     if (Arg.getOpcode() == ISD::BITCAST &&
1032         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1033       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1034         unsigned Element = Const->getZExtValue();
1035         return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
1036             Arg->getOperand(0).getOperand(Element));
1037       }
1038     }
1039   }
1040
1041   case ISD::SELECT_CC: {
1042     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1043     //      selectcc x, y, a, b, inv(cc)
1044     //
1045     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1046     //      selectcc x, y, a, b, cc
1047     SDValue LHS = N->getOperand(0);
1048     if (LHS.getOpcode() != ISD::SELECT_CC) {
1049       return SDValue();
1050     }
1051
1052     SDValue RHS = N->getOperand(1);
1053     SDValue True = N->getOperand(2);
1054     SDValue False = N->getOperand(3);
1055     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1056
1057     if (LHS.getOperand(2).getNode() != True.getNode() ||
1058         LHS.getOperand(3).getNode() != False.getNode() ||
1059         RHS.getNode() != False.getNode()) {
1060       return SDValue();
1061     }
1062
1063     switch (NCC) {
1064     default: return SDValue();
1065     case ISD::SETNE: return LHS;
1066     case ISD::SETEQ: {
1067       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1068       LHSCC = ISD::getSetCCInverse(LHSCC,
1069                                   LHS.getOperand(0).getValueType().isInteger());
1070       return DAG.getSelectCC(N->getDebugLoc(),
1071                              LHS.getOperand(0),
1072                              LHS.getOperand(1),
1073                              LHS.getOperand(2),
1074                              LHS.getOperand(3),
1075                              LHSCC);
1076     }
1077     }
1078   }
1079   case AMDGPUISD::EXPORT: {
1080     SDValue Arg = N->getOperand(1);
1081     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1082       break;
1083     SDValue NewBldVec[4] = {
1084         DAG.getUNDEF(MVT::f32),
1085         DAG.getUNDEF(MVT::f32),
1086         DAG.getUNDEF(MVT::f32),
1087         DAG.getUNDEF(MVT::f32)
1088       };
1089     SDValue NewArgs[8] = {
1090       N->getOperand(0), // Chain
1091       SDValue(),
1092       N->getOperand(2), // ArrayBase
1093       N->getOperand(3), // Type
1094       N->getOperand(4), // SWZ_X
1095       N->getOperand(5), // SWZ_Y
1096       N->getOperand(6), // SWZ_Z
1097       N->getOperand(7) // SWZ_W
1098     };
1099     for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
1100       if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
1101         if (C->isZero()) {
1102           NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
1103         } else if (C->isExactlyValue(1.0)) {
1104           NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
1105         } else {
1106           NewBldVec[i] = Arg.getOperand(i);
1107         }
1108       } else {
1109         NewBldVec[i] = Arg.getOperand(i);
1110       }
1111     }
1112     DebugLoc DL = N->getDebugLoc();
1113     NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
1114     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1115   }
1116   }
1117   return SDValue();
1118 }