lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
  34     AMDGPUTargetLowering(TM),
  35     Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties();
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   for (MVT VT : MVT::integer_valuetypes()) {
 126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 128     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 129
 130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 132     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 133
 134     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 137   }
 138
 139   setOperationAction(ISD::STORE, MVT::i8, Custom);
 140   setOperationAction(ISD::STORE, MVT::i32, Custom);
 141   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 142   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 143   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 144   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 145
 146   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 147   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 148   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 149
 150   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 151   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 152   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 153   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 154
 155   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 156   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 157   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 158   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 159
 160   setTargetDAGCombine(ISD::FP_ROUND);
 161   setTargetDAGCombine(ISD::FP_TO_SINT);
 162   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 163   setTargetDAGCombine(ISD::SELECT_CC);
 164   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 165
 166   setOperationAction(ISD::SUB, MVT::i64, Expand);
 167
 168   // These should be replaced by UDVIREM, but it does not happen automatically
 169   // during Type Legalization
 170   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 171   setOperationAction(ISD::UREM, MVT::i64, Custom);
 172   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 173   setOperationAction(ISD::SREM, MVT::i64, Custom);
 174
 175   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 176   //  to be Legal/Custom in order to avoid library calls.
 177   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 178   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 179   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 180
 181   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 182
 183   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 184   for (MVT VT : ScalarIntVTs) {
 185     setOperationAction(ISD::ADDC, VT, Expand);
 186     setOperationAction(ISD::SUBC, VT, Expand);
 187     setOperationAction(ISD::ADDE, VT, Expand);
 188     setOperationAction(ISD::SUBE, VT, Expand);
 189   }
 190
 191   setSchedulingPreference(Sched::Source);
 192 }
 193
 194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 195     MachineInstr * MI, MachineBasicBlock * BB) const {
 196   MachineFunction * MF = BB->getParent();
 197   MachineRegisterInfo &MRI = MF->getRegInfo();
 198   MachineBasicBlock::iterator I = *MI;
 199   const R600InstrInfo *TII =
 200       static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
 201
 202   switch (MI->getOpcode()) {
 203   default:
 204     // Replace LDS_*_RET instruction that don't have any uses with the
 205     // equivalent LDS_*_NORET instruction.
 206     if (TII->isLDSRetInstr(MI->getOpcode())) {
 207       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 208       assert(DstIdx != -1);
 209       MachineInstrBuilder NewMI;
 210       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 211       //        LDS_1A2D support and remove this special case.
 212       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 213            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 214         return BB;
 215
 216       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 217                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 218       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 219         NewMI.addOperand(MI->getOperand(i));
 220       }
 221     } else {
 222       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 223     }
 224     break;
 225   case AMDGPU::CLAMP_R600: {
 226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 227                                                    AMDGPU::MOV,
 228                                                    MI->getOperand(0).getReg(),
 229                                                    MI->getOperand(1).getReg());
 230     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 231     break;
 232   }
 233
 234   case AMDGPU::FABS_R600: {
 235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 236                                                     AMDGPU::MOV,
 237                                                     MI->getOperand(0).getReg(),
 238                                                     MI->getOperand(1).getReg());
 239     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 240     break;
 241   }
 242
 243   case AMDGPU::FNEG_R600: {
 244     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 245                                                     AMDGPU::MOV,
 246                                                     MI->getOperand(0).getReg(),
 247                                                     MI->getOperand(1).getReg());
 248     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 249     break;
 250   }
 251
 252   case AMDGPU::MASK_WRITE: {
 253     unsigned maskedRegister = MI->getOperand(0).getReg();
 254     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 255     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 256     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 257     break;
 258   }
 259
 260   case AMDGPU::MOV_IMM_F32:
 261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 262                      MI->getOperand(1).getFPImm()->getValueAPF()
 263                          .bitcastToAPInt().getZExtValue());
 264     break;
 265   case AMDGPU::MOV_IMM_I32:
 266     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 267                      MI->getOperand(1).getImm());
 268     break;
 269   case AMDGPU::CONST_COPY: {
 270     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 271         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 272     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 273         MI->getOperand(1).getImm());
 274     break;
 275   }
 276
 277   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 278   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 279   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 280     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 281
 282     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 283             .addOperand(MI->getOperand(0))
 284             .addOperand(MI->getOperand(1))
 285             .addImm(EOP); // Set End of program bit
 286     break;
 287   }
 288
 289   case AMDGPU::TXD: {
 290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 292     MachineOperand &RID = MI->getOperand(4);
 293     MachineOperand &SID = MI->getOperand(5);
 294     unsigned TextureId = MI->getOperand(6).getImm();
 295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 297
 298     switch (TextureId) {
 299     case 5: // Rect
 300       CTX = CTY = 0;
 301       break;
 302     case 6: // Shadow1D
 303       SrcW = SrcZ;
 304       break;
 305     case 7: // Shadow2D
 306       SrcW = SrcZ;
 307       break;
 308     case 8: // ShadowRect
 309       CTX = CTY = 0;
 310       SrcW = SrcZ;
 311       break;
 312     case 9: // 1DArray
 313       SrcZ = SrcY;
 314       CTZ = 0;
 315       break;
 316     case 10: // 2DArray
 317       CTZ = 0;
 318       break;
 319     case 11: // Shadow1DArray
 320       SrcZ = SrcY;
 321       CTZ = 0;
 322       break;
 323     case 12: // Shadow2DArray
 324       CTZ = 0;
 325       break;
 326     }
 327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 328             .addOperand(MI->getOperand(3))
 329             .addImm(SrcX)
 330             .addImm(SrcY)
 331             .addImm(SrcZ)
 332             .addImm(SrcW)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(0)
 337             .addImm(1)
 338             .addImm(2)
 339             .addImm(3)
 340             .addOperand(RID)
 341             .addOperand(SID)
 342             .addImm(CTX)
 343             .addImm(CTY)
 344             .addImm(CTZ)
 345             .addImm(CTW);
 346     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 347             .addOperand(MI->getOperand(2))
 348             .addImm(SrcX)
 349             .addImm(SrcY)
 350             .addImm(SrcZ)
 351             .addImm(SrcW)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(0)
 356             .addImm(1)
 357             .addImm(2)
 358             .addImm(3)
 359             .addOperand(RID)
 360             .addOperand(SID)
 361             .addImm(CTX)
 362             .addImm(CTY)
 363             .addImm(CTZ)
 364             .addImm(CTW);
 365     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 366             .addOperand(MI->getOperand(0))
 367             .addOperand(MI->getOperand(1))
 368             .addImm(SrcX)
 369             .addImm(SrcY)
 370             .addImm(SrcZ)
 371             .addImm(SrcW)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(0)
 376             .addImm(1)
 377             .addImm(2)
 378             .addImm(3)
 379             .addOperand(RID)
 380             .addOperand(SID)
 381             .addImm(CTX)
 382             .addImm(CTY)
 383             .addImm(CTZ)
 384             .addImm(CTW)
 385             .addReg(T0, RegState::Implicit)
 386             .addReg(T1, RegState::Implicit);
 387     break;
 388   }
 389
 390   case AMDGPU::TXD_SHADOW: {
 391     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 392     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 393     MachineOperand &RID = MI->getOperand(4);
 394     MachineOperand &SID = MI->getOperand(5);
 395     unsigned TextureId = MI->getOperand(6).getImm();
 396     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 397     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 398
 399     switch (TextureId) {
 400     case 5: // Rect
 401       CTX = CTY = 0;
 402       break;
 403     case 6: // Shadow1D
 404       SrcW = SrcZ;
 405       break;
 406     case 7: // Shadow2D
 407       SrcW = SrcZ;
 408       break;
 409     case 8: // ShadowRect
 410       CTX = CTY = 0;
 411       SrcW = SrcZ;
 412       break;
 413     case 9: // 1DArray
 414       SrcZ = SrcY;
 415       CTZ = 0;
 416       break;
 417     case 10: // 2DArray
 418       CTZ = 0;
 419       break;
 420     case 11: // Shadow1DArray
 421       SrcZ = SrcY;
 422       CTZ = 0;
 423       break;
 424     case 12: // Shadow2DArray
 425       CTZ = 0;
 426       break;
 427     }
 428
 429     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 430             .addOperand(MI->getOperand(3))
 431             .addImm(SrcX)
 432             .addImm(SrcY)
 433             .addImm(SrcZ)
 434             .addImm(SrcW)
 435             .addImm(0)
 436             .addImm(0)
 437             .addImm(0)
 438             .addImm(0)
 439             .addImm(1)
 440             .addImm(2)
 441             .addImm(3)
 442             .addOperand(RID)
 443             .addOperand(SID)
 444             .addImm(CTX)
 445             .addImm(CTY)
 446             .addImm(CTZ)
 447             .addImm(CTW);
 448     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 449             .addOperand(MI->getOperand(2))
 450             .addImm(SrcX)
 451             .addImm(SrcY)
 452             .addImm(SrcZ)
 453             .addImm(SrcW)
 454             .addImm(0)
 455             .addImm(0)
 456             .addImm(0)
 457             .addImm(0)
 458             .addImm(1)
 459             .addImm(2)
 460             .addImm(3)
 461             .addOperand(RID)
 462             .addOperand(SID)
 463             .addImm(CTX)
 464             .addImm(CTY)
 465             .addImm(CTZ)
 466             .addImm(CTW);
 467     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 468             .addOperand(MI->getOperand(0))
 469             .addOperand(MI->getOperand(1))
 470             .addImm(SrcX)
 471             .addImm(SrcY)
 472             .addImm(SrcZ)
 473             .addImm(SrcW)
 474             .addImm(0)
 475             .addImm(0)
 476             .addImm(0)
 477             .addImm(0)
 478             .addImm(1)
 479             .addImm(2)
 480             .addImm(3)
 481             .addOperand(RID)
 482             .addOperand(SID)
 483             .addImm(CTX)
 484             .addImm(CTY)
 485             .addImm(CTZ)
 486             .addImm(CTW)
 487             .addReg(T0, RegState::Implicit)
 488             .addReg(T1, RegState::Implicit);
 489     break;
 490   }
 491
 492   case AMDGPU::BRANCH:
 493       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 494               .addOperand(MI->getOperand(0));
 495       break;
 496
 497   case AMDGPU::BRANCH_COND_f32: {
 498     MachineInstr *NewMI =
 499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 500               AMDGPU::PREDICATE_BIT)
 501               .addOperand(MI->getOperand(1))
 502               .addImm(OPCODE_IS_NOT_ZERO)
 503               .addImm(0); // Flags
 504     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 505     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 506             .addOperand(MI->getOperand(0))
 507             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 508     break;
 509   }
 510
 511   case AMDGPU::BRANCH_COND_i32: {
 512     MachineInstr *NewMI =
 513       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 514             AMDGPU::PREDICATE_BIT)
 515             .addOperand(MI->getOperand(1))
 516             .addImm(OPCODE_IS_NOT_ZERO_INT)
 517             .addImm(0); // Flags
 518     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 519     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 520            .addOperand(MI->getOperand(0))
 521             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 522     break;
 523   }
 524
 525   case AMDGPU::EG_ExportSwz:
 526   case AMDGPU::R600_ExportSwz: {
 527     // Instruction is left unmodified if its not the last one of its type
 528     bool isLastInstructionOfItsType = true;
 529     unsigned InstExportType = MI->getOperand(1).getImm();
 530     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 531          EndBlock = BB->end(); NextExportInst != EndBlock;
 532          NextExportInst = std::next(NextExportInst)) {
 533       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 534           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 535         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 536             .getImm();
 537         if (CurrentInstExportType == InstExportType) {
 538           isLastInstructionOfItsType = false;
 539           break;
 540         }
 541       }
 542     }
 543     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 544     if (!EOP && !isLastInstructionOfItsType)
 545       return BB;
 546     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 547     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 548             .addOperand(MI->getOperand(0))
 549             .addOperand(MI->getOperand(1))
 550             .addOperand(MI->getOperand(2))
 551             .addOperand(MI->getOperand(3))
 552             .addOperand(MI->getOperand(4))
 553             .addOperand(MI->getOperand(5))
 554             .addOperand(MI->getOperand(6))
 555             .addImm(CfInst)
 556             .addImm(EOP);
 557     break;
 558   }
 559   case AMDGPU::RETURN: {
 560     // RETURN instructions must have the live-out registers as implicit uses,
 561     // otherwise they appear dead.
 562     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 563     MachineInstrBuilder MIB(*MF, MI);
 564     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 565       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 566     return BB;
 567   }
 568   }
 569
 570   MI->eraseFromParent();
 571   return BB;
 572 }
 573
 574 //===----------------------------------------------------------------------===//
 575 // Custom DAG Lowering Operations
 576 //===----------------------------------------------------------------------===//
 577
 578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 579   MachineFunction &MF = DAG.getMachineFunction();
 580   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 581   switch (Op.getOpcode()) {
 582   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 583   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 584   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 585   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 586   case ISD::SRA_PARTS:
 587   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 588   case ISD::FCOS:
 589   case ISD::FSIN: return LowerTrig(Op, DAG);
 590   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 591   case ISD::STORE: return LowerSTORE(Op, DAG);
 592   case ISD::LOAD: {
 593     SDValue Result = LowerLOAD(Op, DAG);
 594     assert((!Result.getNode() ||
 595             Result.getNode()->getNumValues() == 2) &&
 596            "Load should return a value and a chain");
 597     return Result;
 598   }
 599
 600   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 601   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 602   case ISD::INTRINSIC_VOID: {
 603     SDValue Chain = Op.getOperand(0);
 604     unsigned IntrinsicID =
 605                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 606     switch (IntrinsicID) {
 607     case AMDGPUIntrinsic::AMDGPU_store_output: {
 608       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 609       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 610       MFI->LiveOuts.push_back(Reg);
 611       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 612     }
 613     case AMDGPUIntrinsic::R600_store_swizzle: {
 614       const SDValue Args[8] = {
 615         Chain,
 616         Op.getOperand(2), // Export Value
 617         Op.getOperand(3), // ArrayBase
 618         Op.getOperand(4), // Type
 619         DAG.getConstant(0, MVT::i32), // SWZ_X
 620         DAG.getConstant(1, MVT::i32), // SWZ_Y
 621         DAG.getConstant(2, MVT::i32), // SWZ_Z
 622         DAG.getConstant(3, MVT::i32) // SWZ_W
 623       };
 624       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 625     }
 626
 627     // default for switch(IntrinsicID)
 628     default: break;
 629     }
 630     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 631     break;
 632   }
 633   case ISD::INTRINSIC_WO_CHAIN: {
 634     unsigned IntrinsicID =
 635                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 636     EVT VT = Op.getValueType();
 637     SDLoc DL(Op);
 638     switch(IntrinsicID) {
 639     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 640     case AMDGPUIntrinsic::R600_load_input: {
 641       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 642       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 643       MachineFunction &MF = DAG.getMachineFunction();
 644       MachineRegisterInfo &MRI = MF.getRegInfo();
 645       MRI.addLiveIn(Reg);
 646       return DAG.getCopyFromReg(DAG.getEntryNode(),
 647           SDLoc(DAG.getEntryNode()), Reg, VT);
 648     }
 649
 650     case AMDGPUIntrinsic::R600_interp_input: {
 651       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 652       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 653       MachineSDNode *interp;
 654       if (ijb < 0) {
 655         const MachineFunction &MF = DAG.getMachineFunction();
 656         const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
 657             MF.getSubtarget().getInstrInfo());
 658         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 659             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 660         return DAG.getTargetExtractSubreg(
 661             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 662             DL, MVT::f32, SDValue(interp, 0));
 663       }
 664       MachineFunction &MF = DAG.getMachineFunction();
 665       MachineRegisterInfo &MRI = MF.getRegInfo();
 666       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 667       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 668       MRI.addLiveIn(RegisterI);
 669       MRI.addLiveIn(RegisterJ);
 670       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 671           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 672       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 673           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 674
 675       if (slot % 4 < 2)
 676         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 677             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 678             RegisterJNode, RegisterINode);
 679       else
 680         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 681             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 682             RegisterJNode, RegisterINode);
 683       return SDValue(interp, slot % 2);
 684     }
 685     case AMDGPUIntrinsic::R600_interp_xy:
 686     case AMDGPUIntrinsic::R600_interp_zw: {
 687       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 688       MachineSDNode *interp;
 689       SDValue RegisterINode = Op.getOperand(2);
 690       SDValue RegisterJNode = Op.getOperand(3);
 691
 692       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 693         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 694             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 695             RegisterJNode, RegisterINode);
 696       else
 697         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 698             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 699             RegisterJNode, RegisterINode);
 700       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 701           SDValue(interp, 0), SDValue(interp, 1));
 702     }
 703     case AMDGPUIntrinsic::R600_tex:
 704     case AMDGPUIntrinsic::R600_texc:
 705     case AMDGPUIntrinsic::R600_txl:
 706     case AMDGPUIntrinsic::R600_txlc:
 707     case AMDGPUIntrinsic::R600_txb:
 708     case AMDGPUIntrinsic::R600_txbc:
 709     case AMDGPUIntrinsic::R600_txf:
 710     case AMDGPUIntrinsic::R600_txq:
 711     case AMDGPUIntrinsic::R600_ddx:
 712     case AMDGPUIntrinsic::R600_ddy:
 713     case AMDGPUIntrinsic::R600_ldptr: {
 714       unsigned TextureOp;
 715       switch (IntrinsicID) {
 716       case AMDGPUIntrinsic::R600_tex:
 717         TextureOp = 0;
 718         break;
 719       case AMDGPUIntrinsic::R600_texc:
 720         TextureOp = 1;
 721         break;
 722       case AMDGPUIntrinsic::R600_txl:
 723         TextureOp = 2;
 724         break;
 725       case AMDGPUIntrinsic::R600_txlc:
 726         TextureOp = 3;
 727         break;
 728       case AMDGPUIntrinsic::R600_txb:
 729         TextureOp = 4;
 730         break;
 731       case AMDGPUIntrinsic::R600_txbc:
 732         TextureOp = 5;
 733         break;
 734       case AMDGPUIntrinsic::R600_txf:
 735         TextureOp = 6;
 736         break;
 737       case AMDGPUIntrinsic::R600_txq:
 738         TextureOp = 7;
 739         break;
 740       case AMDGPUIntrinsic::R600_ddx:
 741         TextureOp = 8;
 742         break;
 743       case AMDGPUIntrinsic::R600_ddy:
 744         TextureOp = 9;
 745         break;
 746       case AMDGPUIntrinsic::R600_ldptr:
 747         TextureOp = 10;
 748         break;
 749       default:
 750         llvm_unreachable("Unknow Texture Operation");
 751       }
 752
 753       SDValue TexArgs[19] = {
 754         DAG.getConstant(TextureOp, MVT::i32),
 755         Op.getOperand(1),
 756         DAG.getConstant(0, MVT::i32),
 757         DAG.getConstant(1, MVT::i32),
 758         DAG.getConstant(2, MVT::i32),
 759         DAG.getConstant(3, MVT::i32),
 760         Op.getOperand(2),
 761         Op.getOperand(3),
 762         Op.getOperand(4),
 763         DAG.getConstant(0, MVT::i32),
 764         DAG.getConstant(1, MVT::i32),
 765         DAG.getConstant(2, MVT::i32),
 766         DAG.getConstant(3, MVT::i32),
 767         Op.getOperand(5),
 768         Op.getOperand(6),
 769         Op.getOperand(7),
 770         Op.getOperand(8),
 771         Op.getOperand(9),
 772         Op.getOperand(10)
 773       };
 774       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 775     }
 776     case AMDGPUIntrinsic::AMDGPU_dp4: {
 777       SDValue Args[8] = {
 778       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 779           DAG.getConstant(0, MVT::i32)),
 780       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 781           DAG.getConstant(0, MVT::i32)),
 782       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 783           DAG.getConstant(1, MVT::i32)),
 784       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 785           DAG.getConstant(1, MVT::i32)),
 786       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 787           DAG.getConstant(2, MVT::i32)),
 788       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 789           DAG.getConstant(2, MVT::i32)),
 790       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 791           DAG.getConstant(3, MVT::i32)),
 792       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 793           DAG.getConstant(3, MVT::i32))
 794       };
 795       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 796     }
 797
 798     case Intrinsic::r600_read_ngroups_x:
 799       return LowerImplicitParameter(DAG, VT, DL, 0);
 800     case Intrinsic::r600_read_ngroups_y:
 801       return LowerImplicitParameter(DAG, VT, DL, 1);
 802     case Intrinsic::r600_read_ngroups_z:
 803       return LowerImplicitParameter(DAG, VT, DL, 2);
 804     case Intrinsic::r600_read_global_size_x:
 805       return LowerImplicitParameter(DAG, VT, DL, 3);
 806     case Intrinsic::r600_read_global_size_y:
 807       return LowerImplicitParameter(DAG, VT, DL, 4);
 808     case Intrinsic::r600_read_global_size_z:
 809       return LowerImplicitParameter(DAG, VT, DL, 5);
 810     case Intrinsic::r600_read_local_size_x:
 811       return LowerImplicitParameter(DAG, VT, DL, 6);
 812     case Intrinsic::r600_read_local_size_y:
 813       return LowerImplicitParameter(DAG, VT, DL, 7);
 814     case Intrinsic::r600_read_local_size_z:
 815       return LowerImplicitParameter(DAG, VT, DL, 8);
 816
 817     case Intrinsic::AMDGPU_read_workdim:
 818       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 819
 820     case Intrinsic::r600_read_tgid_x:
 821       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 822                                   AMDGPU::T1_X, VT);
 823     case Intrinsic::r600_read_tgid_y:
 824       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 825                                   AMDGPU::T1_Y, VT);
 826     case Intrinsic::r600_read_tgid_z:
 827       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 828                                   AMDGPU::T1_Z, VT);
 829     case Intrinsic::r600_read_tidig_x:
 830       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 831                                   AMDGPU::T0_X, VT);
 832     case Intrinsic::r600_read_tidig_y:
 833       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 834                                   AMDGPU::T0_Y, VT);
 835     case Intrinsic::r600_read_tidig_z:
 836       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 837                                   AMDGPU::T0_Z, VT);
 838     case Intrinsic::AMDGPU_rsq:
 839       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 840       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 841     }
 842     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 843     break;
 844   }
 845   } // end switch(Op.getOpcode())
 846   return SDValue();
 847 }
 848
 849 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 850                                             SmallVectorImpl<SDValue> &Results,
 851                                             SelectionDAG &DAG) const {
 852   switch (N->getOpcode()) {
 853   default:
 854     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 855     return;
 856   case ISD::FP_TO_UINT:
 857     if (N->getValueType(0) == MVT::i1) {
 858       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 859       return;
 860     }
 861     // Fall-through. Since we don't care about out of bounds values
 862     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 863     // considers some extra cases which are not necessary here.
 864   case ISD::FP_TO_SINT: {
 865     SDValue Result;
 866     if (expandFP_TO_SINT(N, Result, DAG))
 867       Results.push_back(Result);
 868     return;
 869   }
 870   case ISD::UDIV: {
 871     SDValue Op = SDValue(N, 0);
 872     SDLoc DL(Op);
 873     EVT VT = Op.getValueType();
 874     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 875       N->getOperand(0), N->getOperand(1));
 876     Results.push_back(UDIVREM);
 877     break;
 878   }
 879   case ISD::UREM: {
 880     SDValue Op = SDValue(N, 0);
 881     SDLoc DL(Op);
 882     EVT VT = Op.getValueType();
 883     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 884       N->getOperand(0), N->getOperand(1));
 885     Results.push_back(UDIVREM.getValue(1));
 886     break;
 887   }
 888   case ISD::SDIV: {
 889     SDValue Op = SDValue(N, 0);
 890     SDLoc DL(Op);
 891     EVT VT = Op.getValueType();
 892     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 893       N->getOperand(0), N->getOperand(1));
 894     Results.push_back(SDIVREM);
 895     break;
 896   }
 897   case ISD::SREM: {
 898     SDValue Op = SDValue(N, 0);
 899     SDLoc DL(Op);
 900     EVT VT = Op.getValueType();
 901     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 902       N->getOperand(0), N->getOperand(1));
 903     Results.push_back(SDIVREM.getValue(1));
 904     break;
 905   }
 906   case ISD::SDIVREM: {
 907     SDValue Op = SDValue(N, 1);
 908     SDValue RES = LowerSDIVREM(Op, DAG);
 909     Results.push_back(RES);
 910     Results.push_back(RES.getValue(1));
 911     break;
 912   }
 913   case ISD::UDIVREM: {
 914     SDValue Op = SDValue(N, 0);
 915     LowerUDIVREM64(Op, DAG, Results);
 916     break;
 917   }
 918   }
 919 }
 920
 921 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 922                                                    SDValue Vector) const {
 923
 924   SDLoc DL(Vector);
 925   EVT VecVT = Vector.getValueType();
 926   EVT EltVT = VecVT.getVectorElementType();
 927   SmallVector<SDValue, 8> Args;
 928
 929   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 930                                                            i != e; ++i) {
 931     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 932                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 933   }
 934
 935   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 936 }
 937
 938 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 939                                                     SelectionDAG &DAG) const {
 940
 941   SDLoc DL(Op);
 942   SDValue Vector = Op.getOperand(0);
 943   SDValue Index = Op.getOperand(1);
 944
 945   if (isa<ConstantSDNode>(Index) ||
 946       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 947     return Op;
 948
 949   Vector = vectorToVerticalVector(DAG, Vector);
 950   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 951                      Vector, Index);
 952 }
 953
 954 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 955                                                    SelectionDAG &DAG) const {
 956   SDLoc DL(Op);
 957   SDValue Vector = Op.getOperand(0);
 958   SDValue Value = Op.getOperand(1);
 959   SDValue Index = Op.getOperand(2);
 960
 961   if (isa<ConstantSDNode>(Index) ||
 962       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 963     return Op;
 964
 965   Vector = vectorToVerticalVector(DAG, Vector);
 966   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 967                                Vector, Value, Index);
 968   return vectorToVerticalVector(DAG, Insert);
 969 }
 970
 971 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 972   // On hw >= R700, COS/SIN input must be between -1. and 1.
 973   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 974   EVT VT = Op.getValueType();
 975   SDValue Arg = Op.getOperand(0);
 976   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 977       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 978         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 979           DAG.getConstantFP(0.15915494309, MVT::f32)),
 980         DAG.getConstantFP(0.5, MVT::f32)));
 981   unsigned TrigNode;
 982   switch (Op.getOpcode()) {
 983   case ISD::FCOS:
 984     TrigNode = AMDGPUISD::COS_HW;
 985     break;
 986   case ISD::FSIN:
 987     TrigNode = AMDGPUISD::SIN_HW;
 988     break;
 989   default:
 990     llvm_unreachable("Wrong trig opcode");
 991   }
 992   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 993       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 994         DAG.getConstantFP(-0.5, MVT::f32)));
 995   if (Gen >= AMDGPUSubtarget::R700)
 996     return TrigVal;
 997   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 998   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 999       DAG.getConstantFP(3.14159265359, MVT::f32));
1000 }
1001
1002 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1003   SDLoc DL(Op);
1004   EVT VT = Op.getValueType();
1005
1006   SDValue Lo = Op.getOperand(0);
1007   SDValue Hi = Op.getOperand(1);
1008   SDValue Shift = Op.getOperand(2);
1009   SDValue Zero = DAG.getConstant(0, VT);
1010   SDValue One  = DAG.getConstant(1, VT);
1011
1012   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1013   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1014   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1015   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1016
1017   // The dance around Width1 is necessary for 0 special case.
1018   // Without it the CompShift might be 32, producing incorrect results in
1019   // Overflow. So we do the shift in two steps, the alternative is to
1020   // add a conditional to filter the special case.
1021
1022   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1023   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1024
1025   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1026   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1027   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1028
1029   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1030   SDValue LoBig = Zero;
1031
1032   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1033   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1034
1035   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1036 }
1037
1038 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1039   SDLoc DL(Op);
1040   EVT VT = Op.getValueType();
1041
1042   SDValue Lo = Op.getOperand(0);
1043   SDValue Hi = Op.getOperand(1);
1044   SDValue Shift = Op.getOperand(2);
1045   SDValue Zero = DAG.getConstant(0, VT);
1046   SDValue One  = DAG.getConstant(1, VT);
1047
1048   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1049
1050   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1051   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1052   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1053   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1054
1055   // The dance around Width1 is necessary for 0 special case.
1056   // Without it the CompShift might be 32, producing incorrect results in
1057   // Overflow. So we do the shift in two steps, the alternative is to
1058   // add a conditional to filter the special case.
1059
1060   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1061   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1062
1063   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1064   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1065   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1066
1067   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1068   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1069
1070   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1071   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1072
1073   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1074 }
1075
1076 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1077   return DAG.getNode(
1078       ISD::SETCC,
1079       SDLoc(Op),
1080       MVT::i1,
1081       Op, DAG.getConstantFP(0.0f, MVT::f32),
1082       DAG.getCondCode(ISD::SETNE)
1083       );
1084 }
1085
1086 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1087                                                    SDLoc DL,
1088                                                    unsigned DwordOffset) const {
1089   unsigned ByteOffset = DwordOffset * 4;
1090   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1091                                       AMDGPUAS::CONSTANT_BUFFER_0);
1092
1093   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1094   assert(isInt<16>(ByteOffset));
1095
1096   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1097                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1098                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1099                      false, false, false, 0);
1100 }
1101
1102 bool R600TargetLowering::isZero(SDValue Op) const {
1103   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1104     return Cst->isNullValue();
1105   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1106     return CstFP->isZero();
1107   } else {
1108     return false;
1109   }
1110 }
1111
1112 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1113   SDLoc DL(Op);
1114   EVT VT = Op.getValueType();
1115
1116   SDValue LHS = Op.getOperand(0);
1117   SDValue RHS = Op.getOperand(1);
1118   SDValue True = Op.getOperand(2);
1119   SDValue False = Op.getOperand(3);
1120   SDValue CC = Op.getOperand(4);
1121   SDValue Temp;
1122
1123   if (VT == MVT::f32) {
1124     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1125     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1126     if (MinMax)
1127       return MinMax;
1128   }
1129
1130   // LHS and RHS are guaranteed to be the same value type
1131   EVT CompareVT = LHS.getValueType();
1132
1133   // Check if we can lower this to a native operation.
1134
1135   // Try to lower to a SET* instruction:
1136   //
1137   // SET* can match the following patterns:
1138   //
1139   // select_cc f32, f32, -1,  0, cc_supported
1140   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1141   // select_cc i32, i32, -1,  0, cc_supported
1142   //
1143
1144   // Move hardware True/False values to the correct operand.
1145   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1146   ISD::CondCode InverseCC =
1147      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1148   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1149     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1150       std::swap(False, True);
1151       CC = DAG.getCondCode(InverseCC);
1152     } else {
1153       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1154       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1155         std::swap(False, True);
1156         std::swap(LHS, RHS);
1157         CC = DAG.getCondCode(SwapInvCC);
1158       }
1159     }
1160   }
1161
1162   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1163       (CompareVT == VT || VT == MVT::i32)) {
1164     // This can be matched by a SET* instruction.
1165     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1166   }
1167
1168   // Try to lower to a CND* instruction:
1169   //
1170   // CND* can match the following patterns:
1171   //
1172   // select_cc f32, 0.0, f32, f32, cc_supported
1173   // select_cc f32, 0.0, i32, i32, cc_supported
1174   // select_cc i32, 0,   f32, f32, cc_supported
1175   // select_cc i32, 0,   i32, i32, cc_supported
1176   //
1177
1178   // Try to move the zero value to the RHS
1179   if (isZero(LHS)) {
1180     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1181     // Try swapping the operands
1182     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1183     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1184       std::swap(LHS, RHS);
1185       CC = DAG.getCondCode(CCSwapped);
1186     } else {
1187       // Try inverting the conditon and then swapping the operands
1188       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1189       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1190       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1191         std::swap(True, False);
1192         std::swap(LHS, RHS);
1193         CC = DAG.getCondCode(CCSwapped);
1194       }
1195     }
1196   }
1197   if (isZero(RHS)) {
1198     SDValue Cond = LHS;
1199     SDValue Zero = RHS;
1200     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1201     if (CompareVT != VT) {
1202       // Bitcast True / False to the correct types.  This will end up being
1203       // a nop, but it allows us to define only a single pattern in the
1204       // .TD files for each CND* instruction rather than having to have
1205       // one pattern for integer True/False and one for fp True/False
1206       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1207       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1208     }
1209
1210     switch (CCOpcode) {
1211     case ISD::SETONE:
1212     case ISD::SETUNE:
1213     case ISD::SETNE:
1214       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1215       Temp = True;
1216       True = False;
1217       False = Temp;
1218       break;
1219     default:
1220       break;
1221     }
1222     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1223         Cond, Zero,
1224         True, False,
1225         DAG.getCondCode(CCOpcode));
1226     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1227   }
1228
1229   // If we make it this for it means we have no native instructions to handle
1230   // this SELECT_CC, so we must lower it.
1231   SDValue HWTrue, HWFalse;
1232
1233   if (CompareVT == MVT::f32) {
1234     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1235     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1236   } else if (CompareVT == MVT::i32) {
1237     HWTrue = DAG.getConstant(-1, CompareVT);
1238     HWFalse = DAG.getConstant(0, CompareVT);
1239   }
1240   else {
1241     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1242   }
1243
1244   // Lower this unsupported SELECT_CC into a combination of two supported
1245   // SELECT_CC operations.
1246   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1247
1248   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1249       Cond, HWFalse,
1250       True, False,
1251       DAG.getCondCode(ISD::SETNE));
1252 }
1253
1254 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1255 /// convert these pointers to a register index.  Each register holds
1256 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1257 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1258 /// for indirect addressing.
1259 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1260                                                unsigned StackWidth,
1261                                                SelectionDAG &DAG) const {
1262   unsigned SRLPad;
1263   switch(StackWidth) {
1264   case 1:
1265     SRLPad = 2;
1266     break;
1267   case 2:
1268     SRLPad = 3;
1269     break;
1270   case 4:
1271     SRLPad = 4;
1272     break;
1273   default: llvm_unreachable("Invalid stack width");
1274   }
1275
1276   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1277                      DAG.getConstant(SRLPad, MVT::i32));
1278 }
1279
1280 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1281                                          unsigned ElemIdx,
1282                                          unsigned &Channel,
1283                                          unsigned &PtrIncr) const {
1284   switch (StackWidth) {
1285   default:
1286   case 1:
1287     Channel = 0;
1288     if (ElemIdx > 0) {
1289       PtrIncr = 1;
1290     } else {
1291       PtrIncr = 0;
1292     }
1293     break;
1294   case 2:
1295     Channel = ElemIdx % 2;
1296     if (ElemIdx == 2) {
1297       PtrIncr = 1;
1298     } else {
1299       PtrIncr = 0;
1300     }
1301     break;
1302   case 4:
1303     Channel = ElemIdx;
1304     PtrIncr = 0;
1305     break;
1306   }
1307 }
1308
1309 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1310   SDLoc DL(Op);
1311   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1312   SDValue Chain = Op.getOperand(0);
1313   SDValue Value = Op.getOperand(1);
1314   SDValue Ptr = Op.getOperand(2);
1315
1316   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1317   if (Result.getNode()) {
1318     return Result;
1319   }
1320
1321   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1322     if (StoreNode->isTruncatingStore()) {
1323       EVT VT = Value.getValueType();
1324       assert(VT.bitsLE(MVT::i32));
1325       EVT MemVT = StoreNode->getMemoryVT();
1326       SDValue MaskConstant;
1327       if (MemVT == MVT::i8) {
1328         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1329       } else {
1330         assert(MemVT == MVT::i16);
1331         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1332       }
1333       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1334                                       DAG.getConstant(2, MVT::i32));
1335       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1336                                       DAG.getConstant(0x00000003, VT));
1337       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1338       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1339                                    DAG.getConstant(3, VT));
1340       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1341       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1342       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1343       // vector instead.
1344       SDValue Src[4] = {
1345         ShiftedValue,
1346         DAG.getConstant(0, MVT::i32),
1347         DAG.getConstant(0, MVT::i32),
1348         Mask
1349       };
1350       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1351       SDValue Args[3] = { Chain, Input, DWordAddr };
1352       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1353                                      Op->getVTList(), Args, MemVT,
1354                                      StoreNode->getMemOperand());
1355     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1356                Value.getValueType().bitsGE(MVT::i32)) {
1357       // Convert pointer from byte address to dword address.
1358       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1359                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1360                                     Ptr, DAG.getConstant(2, MVT::i32)));
1361
1362       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1363         llvm_unreachable("Truncated and indexed stores not supported yet");
1364       } else {
1365         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1366       }
1367       return Chain;
1368     }
1369   }
1370
1371   EVT ValueVT = Value.getValueType();
1372
1373   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1374     return SDValue();
1375   }
1376
1377   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1378   if (Ret.getNode()) {
1379     return Ret;
1380   }
1381   // Lowering for indirect addressing
1382
1383   const MachineFunction &MF = DAG.getMachineFunction();
1384   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1385       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1386   unsigned StackWidth = TFL->getStackWidth(MF);
1387
1388   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1389
1390   if (ValueVT.isVector()) {
1391     unsigned NumElemVT = ValueVT.getVectorNumElements();
1392     EVT ElemVT = ValueVT.getVectorElementType();
1393     SmallVector<SDValue, 4> Stores(NumElemVT);
1394
1395     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1396                                       "vector width in load");
1397
1398     for (unsigned i = 0; i < NumElemVT; ++i) {
1399       unsigned Channel, PtrIncr;
1400       getStackAddress(StackWidth, i, Channel, PtrIncr);
1401       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1402                         DAG.getConstant(PtrIncr, MVT::i32));
1403       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1404                                  Value, DAG.getConstant(i, MVT::i32));
1405
1406       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1407                               Chain, Elem, Ptr,
1408                               DAG.getTargetConstant(Channel, MVT::i32));
1409     }
1410      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1411    } else {
1412     if (ValueVT == MVT::i8) {
1413       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1414     }
1415     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1416     DAG.getTargetConstant(0, MVT::i32)); // Channel
1417   }
1418
1419   return Chain;
1420 }
1421
1422 // return (512 + (kc_bank << 12)
1423 static int
1424 ConstantAddressBlock(unsigned AddressSpace) {
1425   switch (AddressSpace) {
1426   case AMDGPUAS::CONSTANT_BUFFER_0:
1427     return 512;
1428   case AMDGPUAS::CONSTANT_BUFFER_1:
1429     return 512 + 4096;
1430   case AMDGPUAS::CONSTANT_BUFFER_2:
1431     return 512 + 4096 * 2;
1432   case AMDGPUAS::CONSTANT_BUFFER_3:
1433     return 512 + 4096 * 3;
1434   case AMDGPUAS::CONSTANT_BUFFER_4:
1435     return 512 + 4096 * 4;
1436   case AMDGPUAS::CONSTANT_BUFFER_5:
1437     return 512 + 4096 * 5;
1438   case AMDGPUAS::CONSTANT_BUFFER_6:
1439     return 512 + 4096 * 6;
1440   case AMDGPUAS::CONSTANT_BUFFER_7:
1441     return 512 + 4096 * 7;
1442   case AMDGPUAS::CONSTANT_BUFFER_8:
1443     return 512 + 4096 * 8;
1444   case AMDGPUAS::CONSTANT_BUFFER_9:
1445     return 512 + 4096 * 9;
1446   case AMDGPUAS::CONSTANT_BUFFER_10:
1447     return 512 + 4096 * 10;
1448   case AMDGPUAS::CONSTANT_BUFFER_11:
1449     return 512 + 4096 * 11;
1450   case AMDGPUAS::CONSTANT_BUFFER_12:
1451     return 512 + 4096 * 12;
1452   case AMDGPUAS::CONSTANT_BUFFER_13:
1453     return 512 + 4096 * 13;
1454   case AMDGPUAS::CONSTANT_BUFFER_14:
1455     return 512 + 4096 * 14;
1456   case AMDGPUAS::CONSTANT_BUFFER_15:
1457     return 512 + 4096 * 15;
1458   default:
1459     return -1;
1460   }
1461 }
1462
1463 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1464 {
1465   EVT VT = Op.getValueType();
1466   SDLoc DL(Op);
1467   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1468   SDValue Chain = Op.getOperand(0);
1469   SDValue Ptr = Op.getOperand(1);
1470   SDValue LoweredLoad;
1471
1472   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1473   if (Ret.getNode()) {
1474     SDValue Ops[2] = {
1475       Ret,
1476       Chain
1477     };
1478     return DAG.getMergeValues(Ops, DL);
1479   }
1480
1481   // Lower loads constant address space global variable loads
1482   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1483       isa<GlobalVariable>(
1484           GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1485
1486     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1487         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1488     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1489         DAG.getConstant(2, MVT::i32));
1490     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1491                        LoadNode->getChain(), Ptr,
1492                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1493   }
1494
1495   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1496     SDValue MergedValues[2] = {
1497       ScalarizeVectorLoad(Op, DAG),
1498       Chain
1499     };
1500     return DAG.getMergeValues(MergedValues, DL);
1501   }
1502
1503   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1504   if (ConstantBlock > -1 &&
1505       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1506        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1507     SDValue Result;
1508     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1509         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1510         isa<ConstantSDNode>(Ptr)) {
1511       SDValue Slots[4];
1512       for (unsigned i = 0; i < 4; i++) {
1513         // We want Const position encoded with the following formula :
1514         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1515         // const_index is Ptr computed by llvm using an alignment of 16.
1516         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1517         // then div by 4 at the ISel step
1518         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1519             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1520         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1521       }
1522       EVT NewVT = MVT::v4i32;
1523       unsigned NumElements = 4;
1524       if (VT.isVector()) {
1525         NewVT = VT;
1526         NumElements = VT.getVectorNumElements();
1527       }
1528       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1529                            makeArrayRef(Slots, NumElements));
1530     } else {
1531       // non-constant ptr can't be folded, keeps it as a v4f32 load
1532       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1533           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1534           DAG.getConstant(LoadNode->getAddressSpace() -
1535                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1536           );
1537     }
1538
1539     if (!VT.isVector()) {
1540       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1541           DAG.getConstant(0, MVT::i32));
1542     }
1543
1544     SDValue MergedValues[2] = {
1545       Result,
1546       Chain
1547     };
1548     return DAG.getMergeValues(MergedValues, DL);
1549   }
1550
1551   // For most operations returning SDValue() will result in the node being
1552   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1553   // need to manually expand loads that may be legal in some address spaces and
1554   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1555   // compute shaders, since the data is sign extended when it is uploaded to the
1556   // buffer. However SEXT loads from other address spaces are not supported, so
1557   // we need to expand them here.
1558   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1559     EVT MemVT = LoadNode->getMemoryVT();
1560     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1561     SDValue ShiftAmount =
1562           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1563     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1564                                   LoadNode->getPointerInfo(), MemVT,
1565                                   LoadNode->isVolatile(),
1566                                   LoadNode->isNonTemporal(),
1567                                   LoadNode->isInvariant(),
1568                                   LoadNode->getAlignment());
1569     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1570     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1571
1572     SDValue MergedValues[2] = { Sra, Chain };
1573     return DAG.getMergeValues(MergedValues, DL);
1574   }
1575
1576   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1577     return SDValue();
1578   }
1579
1580   // Lowering for indirect addressing
1581   const MachineFunction &MF = DAG.getMachineFunction();
1582   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1583       getTargetMachine().getSubtargetImpl()->getFrameLowering());
1584   unsigned StackWidth = TFL->getStackWidth(MF);
1585
1586   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1587
1588   if (VT.isVector()) {
1589     unsigned NumElemVT = VT.getVectorNumElements();
1590     EVT ElemVT = VT.getVectorElementType();
1591     SDValue Loads[4];
1592
1593     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1594                                       "vector width in load");
1595
1596     for (unsigned i = 0; i < NumElemVT; ++i) {
1597       unsigned Channel, PtrIncr;
1598       getStackAddress(StackWidth, i, Channel, PtrIncr);
1599       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1600                         DAG.getConstant(PtrIncr, MVT::i32));
1601       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1602                              Chain, Ptr,
1603                              DAG.getTargetConstant(Channel, MVT::i32),
1604                              Op.getOperand(2));
1605     }
1606     for (unsigned i = NumElemVT; i < 4; ++i) {
1607       Loads[i] = DAG.getUNDEF(ElemVT);
1608     }
1609     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1610     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1611   } else {
1612     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1613                               Chain, Ptr,
1614                               DAG.getTargetConstant(0, MVT::i32), // Channel
1615                               Op.getOperand(2));
1616   }
1617
1618   SDValue Ops[2] = {
1619     LoweredLoad,
1620     Chain
1621   };
1622
1623   return DAG.getMergeValues(Ops, DL);
1624 }
1625
1626 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1627   SDValue Chain = Op.getOperand(0);
1628   SDValue Cond  = Op.getOperand(1);
1629   SDValue Jump  = Op.getOperand(2);
1630
1631   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1632                      Chain, Jump, Cond);
1633 }
1634
1635 /// XXX Only kernel functions are supported, so we can assume for now that
1636 /// every function is a kernel function, but in the future we should use
1637 /// separate calling conventions for kernel and non-kernel functions.
1638 SDValue R600TargetLowering::LowerFormalArguments(
1639                                       SDValue Chain,
1640                                       CallingConv::ID CallConv,
1641                                       bool isVarArg,
1642                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1643                                       SDLoc DL, SelectionDAG &DAG,
1644                                       SmallVectorImpl<SDValue> &InVals) const {
1645   SmallVector<CCValAssign, 16> ArgLocs;
1646   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1647                  *DAG.getContext());
1648   MachineFunction &MF = DAG.getMachineFunction();
1649   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1650
1651   SmallVector<ISD::InputArg, 8> LocalIns;
1652
1653   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1654
1655   AnalyzeFormalArguments(CCInfo, LocalIns);
1656
1657   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1658     CCValAssign &VA = ArgLocs[i];
1659     const ISD::InputArg &In = Ins[i];
1660     EVT VT = In.VT;
1661     EVT MemVT = VA.getLocVT();
1662     if (!VT.isVector() && MemVT.isVector()) {
1663       // Get load source type if scalarized.
1664       MemVT = MemVT.getVectorElementType();
1665     }
1666
1667     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1668       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1669       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1670       InVals.push_back(Register);
1671       continue;
1672     }
1673
1674     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1675                                           AMDGPUAS::CONSTANT_BUFFER_0);
1676
1677     // i64 isn't a legal type, so the register type used ends up as i32, which
1678     // isn't expected here. It attempts to create this sextload, but it ends up
1679     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1680     // for <1 x i64>.
1681
1682     // The first 36 bytes of the input buffer contains information about
1683     // thread group and global sizes.
1684     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1685     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1686       // FIXME: This should really check the extload type, but the handling of
1687       // extload vector parameters seems to be broken.
1688
1689       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1690       Ext = ISD::SEXTLOAD;
1691     }
1692
1693     // Compute the offset from the value.
1694     // XXX - I think PartOffset should give you this, but it seems to give the
1695     // size of the register which isn't useful.
1696
1697     unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
1698     unsigned PartOffset = VA.getLocMemOffset();
1699     unsigned Offset = 36 + VA.getLocMemOffset();
1700
1701     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1702     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1703                               DAG.getConstant(Offset, MVT::i32),
1704                               DAG.getUNDEF(MVT::i32),
1705                               PtrInfo,
1706                               MemVT, false, true, true, 4);
1707
1708     // 4 is the preferred alignment for the CONSTANT memory space.
1709     InVals.push_back(Arg);
1710     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1711   }
1712   return Chain;
1713 }
1714
1715 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1716    if (!VT.isVector())
1717      return MVT::i32;
1718    return VT.changeVectorElementTypeToInteger();
1719 }
1720
1721 static SDValue CompactSwizzlableVector(
1722   SelectionDAG &DAG, SDValue VectorEntry,
1723   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1724   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1725   assert(RemapSwizzle.empty());
1726   SDValue NewBldVec[4] = {
1727     VectorEntry.getOperand(0),
1728     VectorEntry.getOperand(1),
1729     VectorEntry.getOperand(2),
1730     VectorEntry.getOperand(3)
1731   };
1732
1733   for (unsigned i = 0; i < 4; i++) {
1734     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1735       // We mask write here to teach later passes that the ith element of this
1736       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1737       // break false dependencies and additionnaly make assembly easier to read.
1738       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1739     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1740       if (C->isZero()) {
1741         RemapSwizzle[i] = 4; // SEL_0
1742         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1743       } else if (C->isExactlyValue(1.0)) {
1744         RemapSwizzle[i] = 5; // SEL_1
1745         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1746       }
1747     }
1748
1749     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1750       continue;
1751     for (unsigned j = 0; j < i; j++) {
1752       if (NewBldVec[i] == NewBldVec[j]) {
1753         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1754         RemapSwizzle[i] = j;
1755         break;
1756       }
1757     }
1758   }
1759
1760   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1761                      VectorEntry.getValueType(), NewBldVec);
1762 }
1763
1764 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1765                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1766   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1767   assert(RemapSwizzle.empty());
1768   SDValue NewBldVec[4] = {
1769       VectorEntry.getOperand(0),
1770       VectorEntry.getOperand(1),
1771       VectorEntry.getOperand(2),
1772       VectorEntry.getOperand(3)
1773   };
1774   bool isUnmovable[4] = { false, false, false, false };
1775   for (unsigned i = 0; i < 4; i++) {
1776     RemapSwizzle[i] = i;
1777     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1778       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1779           ->getZExtValue();
1780       if (i == Idx)
1781         isUnmovable[Idx] = true;
1782     }
1783   }
1784
1785   for (unsigned i = 0; i < 4; i++) {
1786     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1787       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1788           ->getZExtValue();
1789       if (isUnmovable[Idx])
1790         continue;
1791       // Swap i and Idx
1792       std::swap(NewBldVec[Idx], NewBldVec[i]);
1793       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1794       break;
1795     }
1796   }
1797
1798   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1799                      VectorEntry.getValueType(), NewBldVec);
1800 }
1801
1802
1803 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1804 SDValue Swz[4], SelectionDAG &DAG) const {
1805   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1806   // Old -> New swizzle values
1807   DenseMap<unsigned, unsigned> SwizzleRemap;
1808
1809   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1810   for (unsigned i = 0; i < 4; i++) {
1811     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1812     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1813       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1814   }
1815
1816   SwizzleRemap.clear();
1817   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1818   for (unsigned i = 0; i < 4; i++) {
1819     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1820     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1821       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1822   }
1823
1824   return BuildVector;
1825 }
1826
1827
1828 //===----------------------------------------------------------------------===//
1829 // Custom DAG Optimizations
1830 //===----------------------------------------------------------------------===//
1831
1832 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1833                                               DAGCombinerInfo &DCI) const {
1834   SelectionDAG &DAG = DCI.DAG;
1835
1836   switch (N->getOpcode()) {
1837   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1838   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1839   case ISD::FP_ROUND: {
1840       SDValue Arg = N->getOperand(0);
1841       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1842         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1843                            Arg.getOperand(0));
1844       }
1845       break;
1846     }
1847
1848   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1849   // (i32 select_cc f32, f32, -1, 0 cc)
1850   //
1851   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1852   // this to one of the SET*_DX10 instructions.
1853   case ISD::FP_TO_SINT: {
1854     SDValue FNeg = N->getOperand(0);
1855     if (FNeg.getOpcode() != ISD::FNEG) {
1856       return SDValue();
1857     }
1858     SDValue SelectCC = FNeg.getOperand(0);
1859     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1860         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1861         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1862         !isHWTrueValue(SelectCC.getOperand(2)) ||
1863         !isHWFalseValue(SelectCC.getOperand(3))) {
1864       return SDValue();
1865     }
1866
1867     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1868                            SelectCC.getOperand(0), // LHS
1869                            SelectCC.getOperand(1), // RHS
1870                            DAG.getConstant(-1, MVT::i32), // True
1871                            DAG.getConstant(0, MVT::i32),  // Flase
1872                            SelectCC.getOperand(4)); // CC
1873
1874     break;
1875   }
1876
1877   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1878   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1879   case ISD::INSERT_VECTOR_ELT: {
1880     SDValue InVec = N->getOperand(0);
1881     SDValue InVal = N->getOperand(1);
1882     SDValue EltNo = N->getOperand(2);
1883     SDLoc dl(N);
1884
1885     // If the inserted element is an UNDEF, just use the input vector.
1886     if (InVal.getOpcode() == ISD::UNDEF)
1887       return InVec;
1888
1889     EVT VT = InVec.getValueType();
1890
1891     // If we can't generate a legal BUILD_VECTOR, exit
1892     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1893       return SDValue();
1894
1895     // Check that we know which element is being inserted
1896     if (!isa<ConstantSDNode>(EltNo))
1897       return SDValue();
1898     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1899
1900     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1901     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1902     // vector elements.
1903     SmallVector<SDValue, 8> Ops;
1904     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1905       Ops.append(InVec.getNode()->op_begin(),
1906                  InVec.getNode()->op_end());
1907     } else if (InVec.getOpcode() == ISD::UNDEF) {
1908       unsigned NElts = VT.getVectorNumElements();
1909       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1910     } else {
1911       return SDValue();
1912     }
1913
1914     // Insert the element
1915     if (Elt < Ops.size()) {
1916       // All the operands of BUILD_VECTOR must have the same type;
1917       // we enforce that here.
1918       EVT OpVT = Ops[0].getValueType();
1919       if (InVal.getValueType() != OpVT)
1920         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1921           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1922           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1923       Ops[Elt] = InVal;
1924     }
1925
1926     // Return the new vector
1927     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1928   }
1929
1930   // Extract_vec (Build_vector) generated by custom lowering
1931   // also needs to be customly combined
1932   case ISD::EXTRACT_VECTOR_ELT: {
1933     SDValue Arg = N->getOperand(0);
1934     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1935       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1936         unsigned Element = Const->getZExtValue();
1937         return Arg->getOperand(Element);
1938       }
1939     }
1940     if (Arg.getOpcode() == ISD::BITCAST &&
1941         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1942       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1943         unsigned Element = Const->getZExtValue();
1944         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1945             Arg->getOperand(0).getOperand(Element));
1946       }
1947     }
1948   }
1949
1950   case ISD::SELECT_CC: {
1951     // Try common optimizations
1952     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1953     if (Ret.getNode())
1954       return Ret;
1955
1956     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1957     //      selectcc x, y, a, b, inv(cc)
1958     //
1959     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1960     //      selectcc x, y, a, b, cc
1961     SDValue LHS = N->getOperand(0);
1962     if (LHS.getOpcode() != ISD::SELECT_CC) {
1963       return SDValue();
1964     }
1965
1966     SDValue RHS = N->getOperand(1);
1967     SDValue True = N->getOperand(2);
1968     SDValue False = N->getOperand(3);
1969     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1970
1971     if (LHS.getOperand(2).getNode() != True.getNode() ||
1972         LHS.getOperand(3).getNode() != False.getNode() ||
1973         RHS.getNode() != False.getNode()) {
1974       return SDValue();
1975     }
1976
1977     switch (NCC) {
1978     default: return SDValue();
1979     case ISD::SETNE: return LHS;
1980     case ISD::SETEQ: {
1981       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1982       LHSCC = ISD::getSetCCInverse(LHSCC,
1983                                   LHS.getOperand(0).getValueType().isInteger());
1984       if (DCI.isBeforeLegalizeOps() ||
1985           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1986         return DAG.getSelectCC(SDLoc(N),
1987                                LHS.getOperand(0),
1988                                LHS.getOperand(1),
1989                                LHS.getOperand(2),
1990                                LHS.getOperand(3),
1991                                LHSCC);
1992       break;
1993     }
1994     }
1995     return SDValue();
1996   }
1997
1998   case AMDGPUISD::EXPORT: {
1999     SDValue Arg = N->getOperand(1);
2000     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2001       break;
2002
2003     SDValue NewArgs[8] = {
2004       N->getOperand(0), // Chain
2005       SDValue(),
2006       N->getOperand(2), // ArrayBase
2007       N->getOperand(3), // Type
2008       N->getOperand(4), // SWZ_X
2009       N->getOperand(5), // SWZ_Y
2010       N->getOperand(6), // SWZ_Z
2011       N->getOperand(7) // SWZ_W
2012     };
2013     SDLoc DL(N);
2014     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2015     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2016   }
2017   case AMDGPUISD::TEXTURE_FETCH: {
2018     SDValue Arg = N->getOperand(1);
2019     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2020       break;
2021
2022     SDValue NewArgs[19] = {
2023       N->getOperand(0),
2024       N->getOperand(1),
2025       N->getOperand(2),
2026       N->getOperand(3),
2027       N->getOperand(4),
2028       N->getOperand(5),
2029       N->getOperand(6),
2030       N->getOperand(7),
2031       N->getOperand(8),
2032       N->getOperand(9),
2033       N->getOperand(10),
2034       N->getOperand(11),
2035       N->getOperand(12),
2036       N->getOperand(13),
2037       N->getOperand(14),
2038       N->getOperand(15),
2039       N->getOperand(16),
2040       N->getOperand(17),
2041       N->getOperand(18),
2042     };
2043     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2044     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2045         NewArgs);
2046   }
2047   }
2048
2049   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2050 }
2051
2052 static bool
2053 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2054             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2055   const R600InstrInfo *TII =
2056       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2057   if (!Src.isMachineOpcode())
2058     return false;
2059   switch (Src.getMachineOpcode()) {
2060   case AMDGPU::FNEG_R600:
2061     if (!Neg.getNode())
2062       return false;
2063     Src = Src.getOperand(0);
2064     Neg = DAG.getTargetConstant(1, MVT::i32);
2065     return true;
2066   case AMDGPU::FABS_R600:
2067     if (!Abs.getNode())
2068       return false;
2069     Src = Src.getOperand(0);
2070     Abs = DAG.getTargetConstant(1, MVT::i32);
2071     return true;
2072   case AMDGPU::CONST_COPY: {
2073     unsigned Opcode = ParentNode->getMachineOpcode();
2074     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2075
2076     if (!Sel.getNode())
2077       return false;
2078
2079     SDValue CstOffset = Src.getOperand(0);
2080     if (ParentNode->getValueType(0).isVector())
2081       return false;
2082
2083     // Gather constants values
2084     int SrcIndices[] = {
2085       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2086       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2087       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2096     };
2097     std::vector<unsigned> Consts;
2098     for (int OtherSrcIdx : SrcIndices) {
2099       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2100       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2101         continue;
2102       if (HasDst) {
2103         OtherSrcIdx--;
2104         OtherSelIdx--;
2105       }
2106       if (RegisterSDNode *Reg =
2107           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2108         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2109           ConstantSDNode *Cst
2110             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2111           Consts.push_back(Cst->getZExtValue());
2112         }
2113       }
2114     }
2115
2116     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2117     Consts.push_back(Cst->getZExtValue());
2118     if (!TII->fitsConstReadLimitations(Consts)) {
2119       return false;
2120     }
2121
2122     Sel = CstOffset;
2123     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2124     return true;
2125   }
2126   case AMDGPU::MOV_IMM_I32:
2127   case AMDGPU::MOV_IMM_F32: {
2128     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2129     uint64_t ImmValue = 0;
2130
2131
2132     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2133       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2134       float FloatValue = FPC->getValueAPF().convertToFloat();
2135       if (FloatValue == 0.0) {
2136         ImmReg = AMDGPU::ZERO;
2137       } else if (FloatValue == 0.5) {
2138         ImmReg = AMDGPU::HALF;
2139       } else if (FloatValue == 1.0) {
2140         ImmReg = AMDGPU::ONE;
2141       } else {
2142         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2143       }
2144     } else {
2145       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2146       uint64_t Value = C->getZExtValue();
2147       if (Value == 0) {
2148         ImmReg = AMDGPU::ZERO;
2149       } else if (Value == 1) {
2150         ImmReg = AMDGPU::ONE_INT;
2151       } else {
2152         ImmValue = Value;
2153       }
2154     }
2155
2156     // Check that we aren't already using an immediate.
2157     // XXX: It's possible for an instruction to have more than one
2158     // immediate operand, but this is not supported yet.
2159     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2160       if (!Imm.getNode())
2161         return false;
2162       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2163       assert(C);
2164       if (C->getZExtValue())
2165         return false;
2166       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2167     }
2168     Src = DAG.getRegister(ImmReg, MVT::i32);
2169     return true;
2170   }
2171   default:
2172     return false;
2173   }
2174 }
2175
2176
2177 /// \brief Fold the instructions after selecting them
2178 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2179                                             SelectionDAG &DAG) const {
2180   const R600InstrInfo *TII =
2181       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2182   if (!Node->isMachineOpcode())
2183     return Node;
2184   unsigned Opcode = Node->getMachineOpcode();
2185   SDValue FakeOp;
2186
2187   std::vector<SDValue> Ops;
2188   for (const SDUse &I : Node->ops())
2189     Ops.push_back(I);
2190
2191   if (Opcode == AMDGPU::DOT_4) {
2192     int OperandIdx[] = {
2193       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2194       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2195       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2198       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2199       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2201         };
2202     int NegIdx[] = {
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2211     };
2212     int AbsIdx[] = {
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2221     };
2222     for (unsigned i = 0; i < 8; i++) {
2223       if (OperandIdx[i] < 0)
2224         return Node;
2225       SDValue &Src = Ops[OperandIdx[i] - 1];
2226       SDValue &Neg = Ops[NegIdx[i] - 1];
2227       SDValue &Abs = Ops[AbsIdx[i] - 1];
2228       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2229       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2230       if (HasDst)
2231         SelIdx--;
2232       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2233       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2234         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2235     }
2236   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2237     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2238       SDValue &Src = Ops[i];
2239       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2240         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2241     }
2242   } else if (Opcode == AMDGPU::CLAMP_R600) {
2243     SDValue Src = Node->getOperand(0);
2244     if (!Src.isMachineOpcode() ||
2245         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2246       return Node;
2247     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2248         AMDGPU::OpName::clamp);
2249     if (ClampIdx < 0)
2250       return Node;
2251     std::vector<SDValue> Ops;
2252     unsigned NumOp = Src.getNumOperands();
2253     for(unsigned i = 0; i < NumOp; ++i)
2254           Ops.push_back(Src.getOperand(i));
2255     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2256     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2257         Node->getVTList(), Ops);
2258   } else {
2259     if (!TII->hasInstrModifiers(Opcode))
2260       return Node;
2261     int OperandIdx[] = {
2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2265     };
2266     int NegIdx[] = {
2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2268       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2269       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2270     };
2271     int AbsIdx[] = {
2272       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2273       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2274       -1
2275     };
2276     for (unsigned i = 0; i < 3; i++) {
2277       if (OperandIdx[i] < 0)
2278         return Node;
2279       SDValue &Src = Ops[OperandIdx[i] - 1];
2280       SDValue &Neg = Ops[NegIdx[i] - 1];
2281       SDValue FakeAbs;
2282       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2283       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2284       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2285       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2286       if (HasDst) {
2287         SelIdx--;
2288         ImmIdx--;
2289       }
2290       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2291       SDValue &Imm = Ops[ImmIdx];
2292       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2293         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2294     }
2295   }
2296
2297   return Node;
2298 }