lib/Target/R600/SIISelLowering.cpp

   1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for SI
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "SIISelLowering.h"
  16 #include "AMDIL.h"
  17 #include "AMDILIntrinsicInfo.h"
  18 #include "SIInstrInfo.h"
  19 #include "SIMachineFunctionInfo.h"
  20 #include "SIRegisterInfo.h"
  21 #include "llvm/CodeGen/MachineInstrBuilder.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/SelectionDAG.h"
  24
  25 using namespace llvm;
  26
  27 SITargetLowering::SITargetLowering(TargetMachine &TM) :
  28     AMDGPUTargetLowering(TM),
  29     TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) {
  30   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  31   addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
  32   addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
  33   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
  34   addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass);
  35   addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass);
  36
  37   addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
  38   addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
  39   addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  40   addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  41   addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  42
  43   computeRegisterProperties();
  44
  45   setOperationAction(ISD::AND, MVT::i1, Custom);
  46
  47   setOperationAction(ISD::ADD, MVT::i64, Legal);
  48   setOperationAction(ISD::ADD, MVT::i32, Legal);
  49
  50   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  51
  52   // We need to custom lower loads from the USER_SGPR address space, so we can
  53   // add the SGPRs as livein registers.
  54   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  55   setOperationAction(ISD::LOAD, MVT::i64, Custom);
  56
  57   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  58   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  59
  60   setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  61   setTargetDAGCombine(ISD::SELECT_CC);
  62
  63   setTargetDAGCombine(ISD::SETCC);
  64 }
  65
  66 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
  67     MachineInstr * MI, MachineBasicBlock * BB) const {
  68   const TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
  69   MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
  70   MachineBasicBlock::iterator I = MI;
  71
  72   switch (MI->getOpcode()) {
  73   default:
  74     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  75   case AMDGPU::BRANCH: return BB;
  76   case AMDGPU::CLAMP_SI:
  77     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
  78            .addOperand(MI->getOperand(0))
  79            .addOperand(MI->getOperand(1))
  80            // VSRC1-2 are unused, but we still need to fill all the
  81            // operand slots, so we just reuse the VSRC0 operand
  82            .addOperand(MI->getOperand(1))
  83            .addOperand(MI->getOperand(1))
  84            .addImm(0) // ABS
  85            .addImm(1) // CLAMP
  86            .addImm(0) // OMOD
  87            .addImm(0); // NEG
  88     MI->eraseFromParent();
  89     break;
  90
  91   case AMDGPU::FABS_SI:
  92     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
  93                  .addOperand(MI->getOperand(0))
  94                  .addOperand(MI->getOperand(1))
  95                  // VSRC1-2 are unused, but we still need to fill all the
  96                  // operand slots, so we just reuse the VSRC0 operand
  97                  .addOperand(MI->getOperand(1))
  98                  .addOperand(MI->getOperand(1))
  99                  .addImm(1) // ABS
 100                  .addImm(0) // CLAMP
 101                  .addImm(0) // OMOD
 102                  .addImm(0); // NEG
 103     MI->eraseFromParent();
 104     break;
 105
 106   case AMDGPU::FNEG_SI:
 107     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64))
 108                  .addOperand(MI->getOperand(0))
 109                  .addOperand(MI->getOperand(1))
 110                  // VSRC1-2 are unused, but we still need to fill all the
 111                  // operand slots, so we just reuse the VSRC0 operand
 112                  .addOperand(MI->getOperand(1))
 113                  .addOperand(MI->getOperand(1))
 114                  .addImm(0) // ABS
 115                  .addImm(0) // CLAMP
 116                  .addImm(0) // OMOD
 117                  .addImm(1); // NEG
 118     MI->eraseFromParent();
 119     break;
 120   case AMDGPU::SHADER_TYPE:
 121     BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType =
 122                                         MI->getOperand(0).getImm();
 123     MI->eraseFromParent();
 124     break;
 125
 126   case AMDGPU::SI_INTERP:
 127     LowerSI_INTERP(MI, *BB, I, MRI);
 128     break;
 129   case AMDGPU::SI_INTERP_CONST:
 130     LowerSI_INTERP_CONST(MI, *BB, I, MRI);
 131     break;
 132   case AMDGPU::SI_WQM:
 133     LowerSI_WQM(MI, *BB, I, MRI);
 134     break;
 135   case AMDGPU::SI_V_CNDLT:
 136     LowerSI_V_CNDLT(MI, *BB, I, MRI);
 137     break;
 138   }
 139   return BB;
 140 }
 141
 142 void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
 143     MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
 144   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
 145           .addReg(AMDGPU::EXEC);
 146
 147   MI->eraseFromParent();
 148 }
 149
 150 void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
 151     MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
 152   unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
 153   unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
 154   MachineOperand dst = MI->getOperand(0);
 155   MachineOperand iReg = MI->getOperand(1);
 156   MachineOperand jReg = MI->getOperand(2);
 157   MachineOperand attr_chan = MI->getOperand(3);
 158   MachineOperand attr = MI->getOperand(4);
 159   MachineOperand params = MI->getOperand(5);
 160
 161   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
 162           .addOperand(params);
 163
 164   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp)
 165           .addOperand(iReg)
 166           .addOperand(attr_chan)
 167           .addOperand(attr)
 168           .addReg(M0);
 169
 170   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32))
 171           .addOperand(dst)
 172           .addReg(tmp)
 173           .addOperand(jReg)
 174           .addOperand(attr_chan)
 175           .addOperand(attr)
 176           .addReg(M0);
 177
 178   MI->eraseFromParent();
 179 }
 180
 181 void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
 182     MachineBasicBlock &BB, MachineBasicBlock::iterator I,
 183     MachineRegisterInfo &MRI) const {
 184   MachineOperand dst = MI->getOperand(0);
 185   MachineOperand attr_chan = MI->getOperand(1);
 186   MachineOperand attr = MI->getOperand(2);
 187   MachineOperand params = MI->getOperand(3);
 188   unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass);
 189
 190   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0)
 191           .addOperand(params);
 192
 193   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32))
 194           .addOperand(dst)
 195           .addOperand(attr_chan)
 196           .addOperand(attr)
 197           .addReg(M0);
 198
 199   MI->eraseFromParent();
 200 }
 201
 202 void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
 203     MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
 204   unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
 205
 206   BuildMI(BB, I, BB.findDebugLoc(I),
 207           TII->get(AMDGPU::V_CMP_GT_F32_e32),
 208           VCC)
 209           .addReg(AMDGPU::SREG_LIT_0)
 210           .addOperand(MI->getOperand(1));
 211
 212   BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32))
 213           .addOperand(MI->getOperand(0))
 214           .addOperand(MI->getOperand(3))
 215           .addOperand(MI->getOperand(2))
 216           .addReg(VCC);
 217
 218   MI->eraseFromParent();
 219 }
 220
 221 EVT SITargetLowering::getSetCCResultType(EVT VT) const {
 222   return MVT::i1;
 223 }
 224
 225 //===----------------------------------------------------------------------===//
 226 // Custom DAG Lowering Operations
 227 //===----------------------------------------------------------------------===//
 228
 229 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 230   switch (Op.getOpcode()) {
 231   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 232   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 233   case ISD::LOAD: return LowerLOAD(Op, DAG);
 234   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 235   case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND);
 236   case ISD::INTRINSIC_WO_CHAIN: {
 237     unsigned IntrinsicID =
 238                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 239     EVT VT = Op.getValueType();
 240     switch (IntrinsicID) {
 241     case AMDGPUIntrinsic::SI_vs_load_buffer_index:
 242       return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
 243                                   AMDGPU::VGPR0, VT);
 244     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 245     }
 246     break;
 247   }
 248   }
 249   return SDValue();
 250 }
 251
 252 /// \brief The function is for lowering i1 operations on the
 253 /// VCC register.
 254 ///
 255 /// In the VALU context, VCC is a one bit register, but in the
 256 /// SALU context the VCC is a 64-bit register (1-bit per thread).  Since only
 257 /// the SALU can perform operations on the VCC register, we need to promote
 258 /// the operand types from i1 to i64 in order for tablegen to be able to match
 259 /// this operation to the correct SALU instruction.  We do this promotion by
 260 /// wrapping the operands in a CopyToReg node.
 261 ///
 262 SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op,
 263                                                SelectionDAG &DAG,
 264                                                unsigned VCCNode) const {
 265   DebugLoc DL = Op.getDebugLoc();
 266
 267   SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64,
 268                                DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
 269                                            Op.getOperand(0)),
 270                                DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64,
 271                                            Op.getOperand(1)));
 272
 273   return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode);
 274 }
 275
 276 /// \brief Helper function for LowerBRCOND
 277 static SDNode *findUser(SDValue Value, unsigned Opcode) {
 278
 279   SDNode *Parent = Value.getNode();
 280   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
 281        I != E; ++I) {
 282
 283     if (I.getUse().get() != Value)
 284       continue;
 285
 286     if (I->getOpcode() == Opcode)
 287       return *I;
 288   }
 289   return 0;
 290 }
 291
 292 /// This transforms the control flow intrinsics to get the branch destination as
 293 /// last parameter, also switches branch target with BR if the need arise
 294 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
 295                                       SelectionDAG &DAG) const {
 296
 297   DebugLoc DL = BRCOND.getDebugLoc();
 298
 299   SDNode *Intr = BRCOND.getOperand(1).getNode();
 300   SDValue Target = BRCOND.getOperand(2);
 301   SDNode *BR = 0;
 302
 303   if (Intr->getOpcode() == ISD::SETCC) {
 304     // As long as we negate the condition everything is fine
 305     SDNode *SetCC = Intr;
 306     assert(SetCC->getConstantOperandVal(1) == 1);
 307     assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
 308            ISD::SETNE);
 309     Intr = SetCC->getOperand(0).getNode();
 310
 311   } else {
 312     // Get the target from BR if we don't negate the condition
 313     BR = findUser(BRCOND, ISD::BR);
 314     Target = BR->getOperand(1);
 315   }
 316
 317   assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
 318
 319   // Build the result and
 320   SmallVector<EVT, 4> Res;
 321   for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
 322     Res.push_back(Intr->getValueType(i));
 323
 324   // operands of the new intrinsic call
 325   SmallVector<SDValue, 4> Ops;
 326   Ops.push_back(BRCOND.getOperand(0));
 327   for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
 328     Ops.push_back(Intr->getOperand(i));
 329   Ops.push_back(Target);
 330
 331   // build the new intrinsic call
 332   SDNode *Result = DAG.getNode(
 333     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
 334     DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
 335
 336   if (BR) {
 337     // Give the branch instruction our target
 338     SDValue Ops[] = {
 339       BR->getOperand(0),
 340       BRCOND.getOperand(2)
 341     };
 342     DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
 343   }
 344
 345   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
 346
 347   // Copy the intrinsic results to registers
 348   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
 349     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
 350     if (!CopyToReg)
 351       continue;
 352
 353     Chain = DAG.getCopyToReg(
 354       Chain, DL,
 355       CopyToReg->getOperand(1),
 356       SDValue(Result, i - 1),
 357       SDValue());
 358
 359     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
 360   }
 361
 362   // Remove the old intrinsic from the chain
 363   DAG.ReplaceAllUsesOfValueWith(
 364     SDValue(Intr, Intr->getNumValues() - 1),
 365     Intr->getOperand(0));
 366
 367   return Chain;
 368 }
 369
 370 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 371   EVT VT = Op.getValueType();
 372   LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op);
 373
 374   assert(Ptr);
 375
 376   unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace();
 377
 378   // We only need to lower USER_SGPR address space loads
 379   if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) {
 380     return SDValue();
 381   }
 382
 383   // Loads from the USER_SGPR address space can only have constant value
 384   // pointers.
 385   ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr());
 386   assert(BasePtr);
 387
 388   unsigned TypeDwordWidth = VT.getSizeInBits() / 32;
 389   const TargetRegisterClass * dstClass;
 390   switch (TypeDwordWidth) {
 391     default:
 392       assert(!"USER_SGPR value size not implemented");
 393       return SDValue();
 394     case 1:
 395       dstClass = &AMDGPU::SReg_32RegClass;
 396       break;
 397     case 2:
 398       dstClass = &AMDGPU::SReg_64RegClass;
 399       break;
 400   }
 401   uint64_t Index = BasePtr->getZExtValue();
 402   assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned");
 403   unsigned SGPRIndex = Index / TypeDwordWidth;
 404   unsigned Reg = dstClass->getRegister(SGPRIndex);
 405
 406   DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg,
 407                                                          VT));
 408   return SDValue();
 409 }
 410
 411 SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 412   SDValue LHS = Op.getOperand(0);
 413   SDValue RHS = Op.getOperand(1);
 414   SDValue True = Op.getOperand(2);
 415   SDValue False = Op.getOperand(3);
 416   SDValue CC = Op.getOperand(4);
 417   EVT VT = Op.getValueType();
 418   DebugLoc DL = Op.getDebugLoc();
 419
 420   // Possible Min/Max pattern
 421   SDValue MinMax = LowerMinMax(Op, DAG);
 422   if (MinMax.getNode()) {
 423     return MinMax;
 424   }
 425
 426   SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
 427   return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
 428 }
 429
 430 //===----------------------------------------------------------------------===//
 431 // Custom DAG optimizations
 432 //===----------------------------------------------------------------------===//
 433
 434 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
 435                                             DAGCombinerInfo &DCI) const {
 436   SelectionDAG &DAG = DCI.DAG;
 437   DebugLoc DL = N->getDebugLoc();
 438   EVT VT = N->getValueType(0);
 439
 440   switch (N->getOpcode()) {
 441     default: break;
 442     case ISD::SELECT_CC: {
 443       N->dump();
 444       ConstantSDNode *True, *False;
 445       // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
 446       if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
 447           && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
 448           && True->isAllOnesValue()
 449           && False->isNullValue()
 450           && VT == MVT::i1) {
 451         return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
 452                            N->getOperand(1), N->getOperand(4));
 453
 454       }
 455       break;
 456     }
 457     case ISD::SETCC: {
 458       SDValue Arg0 = N->getOperand(0);
 459       SDValue Arg1 = N->getOperand(1);
 460       SDValue CC = N->getOperand(2);
 461       ConstantSDNode * C = NULL;
 462       ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
 463
 464       // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
 465       if (VT == MVT::i1
 466           && Arg0.getOpcode() == ISD::SIGN_EXTEND
 467           && Arg0.getOperand(0).getValueType() == MVT::i1
 468           && (C = dyn_cast<ConstantSDNode>(Arg1))
 469           && C->isNullValue()
 470           && CCOp == ISD::SETNE) {
 471         return SimplifySetCC(VT, Arg0.getOperand(0),
 472                              DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
 473       }
 474       break;
 475     }
 476   }
 477   return SDValue();
 478 }
 479
 480 #define NODE_NAME_CASE(node) case SIISD::node: return #node;
 481
 482 const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const {
 483   switch (Opcode) {
 484   default: return AMDGPUTargetLowering::getTargetNodeName(Opcode);
 485   NODE_NAME_CASE(VCC_AND)
 486   NODE_NAME_CASE(VCC_BITCAST)
 487   }
 488 }